PyPI - sdg-hub - Versions diffs - 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl - Mend

sdg-hub: 0.4.1 (py3-none-any.whl) → 0.4.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

sdg_hub/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.4.1'
-__version_tuple__ = version_tuple = (0, 4, 1)
+__version__ = version = '0.4.2'
+__version_tuple__ = version_tuple = (0, 4, 2)
 __commit_id__ = commit_id = None

sdg_hub/core/flow/base.py CHANGED Viewed

@@ -30,9 +30,14 @@ from ..blocks.base import BaseBlock
 from ..blocks.registry import BlockRegistry
 from ..utils.datautils import safe_concatenate_with_validation, validate_no_duplicates
 from ..utils.error_handling import EmptyDatasetError, FlowValidationError
-from ..utils.flow_metrics import display_metrics_summary, save_metrics_to_json
+from ..utils.flow_metrics import (
+    display_metrics_summary,
+    display_time_estimation_summary,
+    save_metrics_to_json,
+)
 from ..utils.logger_config import setup_logger
 from ..utils.path_resolution import resolve_path
+from ..utils.time_estimator import estimate_execution_time
 from ..utils.yaml_utils import save_flow_yaml
 from .checkpointer import FlowCheckpointer
 from .metadata import DatasetRequirements, FlowMetadata
@@ -1006,6 +1011,8 @@ class Flow(BaseModel):
         dataset: Dataset,
         sample_size: int = 2,
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
+        max_concurrency: Optional[int] = None,
+        enable_time_estimation: bool = False,
     ) -> dict[str, Any]:
         """Perform a dry run of the flow with a subset of data.
@@ -1017,11 +1024,18 @@ class Flow(BaseModel):
             Number of samples to use for dry run testing.
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
             Runtime parameters organized by block name.
+        max_concurrency : Optional[int], optional
+            Maximum concurrent requests for LLM blocks. If None, no limit is applied.
+        enable_time_estimation : bool, default=False
+            If True, estimates execution time for the full dataset and displays it
+            in a Rich table. Automatically runs a second dry run if needed for
+            accurate scaling analysis.
         Returns
         -------
         Dict[str, Any]
             Dry run results with execution info and sample outputs.
+            Time estimation is displayed in a table but not included in return value.
         Raises
         ------
@@ -1039,6 +1053,19 @@ class Flow(BaseModel):
         validate_no_duplicates(dataset)
+        # Validate max_concurrency parameter
+        if max_concurrency is not None:
+            if isinstance(max_concurrency, bool) or not isinstance(
+                max_concurrency, int
+            ):
+                raise FlowValidationError(
+                    f"max_concurrency must be an int, got {type(max_concurrency).__name__}"
+                )
+            if max_concurrency <= 0:
+                raise FlowValidationError(
+                    f"max_concurrency must be greater than 0, got {max_concurrency}"
+                )
         # Use smaller sample size if dataset is smaller
         actual_sample_size = min(sample_size, len(dataset))
@@ -1056,6 +1083,7 @@ class Flow(BaseModel):
             "flow_version": self.metadata.version,
             "sample_size": actual_sample_size,
             "original_dataset_size": len(dataset),
+            "max_concurrency": max_concurrency,
             "input_columns": dataset.column_names,
             "blocks_executed": [],
             "final_dataset": None,
@@ -1082,6 +1110,10 @@ class Flow(BaseModel):
                 # Prepare block execution parameters
                 block_kwargs = self._prepare_block_kwargs(block, runtime_params)
+                # Add max_concurrency to block kwargs if provided
+                if max_concurrency is not None:
+                    block_kwargs["_flow_max_concurrency"] = max_concurrency
                 # Check if this is a deprecated block and skip validations
                 is_deprecated_block = (
                     hasattr(block, "__class__")
@@ -1099,7 +1131,9 @@ class Flow(BaseModel):
                     # Execute block with validation and logging
                     current_dataset = block(current_dataset, **block_kwargs)
-                block_execution_time = time.time() - block_start_time
+                block_execution_time = (
+                    time.perf_counter() - block_start_time
+                )  # Fixed: use perf_counter consistently
                 # Record block execution info
                 block_info = {
@@ -1138,6 +1172,12 @@ class Flow(BaseModel):
                 f"in {execution_time:.2f}s"
             )
+            # Perform time estimation if requested (displays table but doesn't store in results)
+            if enable_time_estimation:
+                self._estimate_total_time(
+                    dry_run_results, dataset, runtime_params, max_concurrency
+                )
             return dry_run_results
         except Exception as exc:
@@ -1150,6 +1190,103 @@ class Flow(BaseModel):
             raise FlowValidationError(f"Dry run failed: {exc}") from exc
+    def _estimate_total_time(
+        self,
+        first_run_results: dict[str, Any],
+        dataset: Dataset,
+        runtime_params: Optional[dict[str, dict[str, Any]]],
+        max_concurrency: Optional[int],
+    ) -> dict[str, Any]:
+        """Estimate full-dataset execution time from dry-run measurements (private).
+        A single dry run is sufficient when execution is sequential
+        (``max_concurrency == 1``) or when no block has a truthy ``async_mode``.
+        Otherwise the estimate is built from a canonical (1-sample, 5-sample)
+        pair of dry runs: the caller's run is reused when it already has one of
+        those sizes, and whichever run is missing is executed here.
+        Parameters
+        ----------
+        first_run_results : dict
+            Results of the dry run that requested the estimation; must contain
+            ``sample_size``.
+        dataset : Dataset
+            Full dataset; its length is the size the estimate is scaled to.
+        runtime_params : Optional[dict]
+            Runtime parameters, forwarded unchanged to any extra dry run.
+        max_concurrency : Optional[int]
+            Concurrency limit, forwarded to extra dry runs and to the estimator.
+        Returns
+        -------
+        dict
+            The result of ``estimate_execution_time``; it is also rendered with
+            ``display_time_estimation_summary`` before being returned.
+        """
+        initial_size = first_run_results["sample_size"]
+        uses_async = any(getattr(blk, "async_mode", False) for blk in self.blocks)
+        def _measure(n_samples: int) -> dict[str, Any]:
+            # Extra runs disable estimation so they cannot recurse back here.
+            return self.dry_run(
+                dataset,
+                n_samples,
+                runtime_params,
+                max_concurrency,
+                enable_time_estimation=False,
+            )
+        if max_concurrency == 1 or not uses_async:
+            # Sequential execution or no async blocks: one measurement is enough.
+            small_run, large_run = first_run_results, None
+        elif initial_size == 1:
+            # Already have the 1-sample run; only the 5-sample run is missing.
+            logger.info("Running second dry run with 5 samples for time estimation")
+            small_run, large_run = first_run_results, _measure(5)
+        elif initial_size == 5:
+            # Already have the 5-sample run; only the 1-sample run is missing.
+            logger.info("Running second dry run with 1 sample for time estimation")
+            small_run, large_run = _measure(1), first_run_results
+        else:
+            # Any other size: measure both points of the canonical pair.
+            logger.info("Running dry runs with 1 and 5 samples for time estimation")
+            small_run = _measure(1)
+            large_run = _measure(5)
+        estimation = estimate_execution_time(
+            dry_run_1=small_run,
+            dry_run_2=large_run,
+            total_dataset_size=len(dataset),
+            max_concurrency=max_concurrency,
+        )
+        # Display estimation summary
+        display_time_estimation_summary(estimation, len(dataset), max_concurrency)
+        return estimation
     def add_block(self, block: BaseBlock) -> "Flow":
         """Add a block to the flow, returning a new Flow instance.

sdg_hub/core/utils/__init__.py CHANGED Viewed

@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # Local
-from .flow_identifier import get_flow_identifier
-from .path_resolution import resolve_path
+from .flow_identifier import get_flow_identifier as get_flow_identifier
+from .path_resolution import resolve_path as resolve_path
+from .time_estimator import estimate_execution_time as estimate_execution_time
+from .time_estimator import is_llm_using_block as is_llm_using_block
 # This is part of the public API, and used by instructlab
@@ -10,4 +12,10 @@ class GenerateError(Exception):
     """An exception raised during generate step."""
-__all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]
+__all__ = [
+    "GenerateError",
+    "resolve_path",
+    "get_flow_identifier",
+    "estimate_execution_time",
+    "is_llm_using_block",
+]

sdg_hub/core/utils/flow_metrics.py CHANGED Viewed

@@ -188,6 +188,122 @@ def display_metrics_summary(
     console.print()
+def display_time_estimation_summary(
+    time_estimation: dict[str, Any],
+    dataset_size: int,
+    max_concurrency: Optional[int] = None,
+) -> None:
+    """Display rich panels summarizing time estimation results.
+    Prints an overall summary panel and, when ``block_estimates`` is present
+    and non-empty, a second panel with a per-block breakdown.
+    Parameters
+    ----------
+    time_estimation : dict[str, Any]
+        Time estimation results from estimate_execution_time(). Must contain
+        ``estimated_time_seconds``; ``total_estimated_requests`` and
+        ``block_estimates`` are optional.
+    dataset_size : int
+        Total number of samples in the dataset.
+    max_concurrency : Optional[int], optional
+        Maximum concurrency used for estimation. The row is omitted when None.
+    """
+    console = Console()
+    # Create main summary table
+    summary_table = Table(
+        show_header=False,
+        box=None,
+        padding=(0, 1),
+    )
+    summary_table.add_column("Metric", style="bright_cyan")
+    summary_table.add_column("Value", style="bright_white")
+    # Format time: pick the primary unit by magnitude, secondary unit in parentheses
+    est_seconds = time_estimation["estimated_time_seconds"]
+    if est_seconds < 60:
+        time_str = f"{est_seconds:.1f} seconds"
+    elif est_seconds < 3600:
+        time_str = f"{est_seconds / 60:.1f} minutes ({est_seconds / 3600:.2f} hours)"
+    else:
+        time_str = f"{est_seconds / 3600:.2f} hours ({est_seconds / 60:.0f} minutes)"
+    summary_table.add_row("Estimated Time:", time_str)
+    # Single-run estimates report 0 requests, so read the key once with a default
+    total_requests = time_estimation.get("total_estimated_requests", 0)
+    summary_table.add_row("Total LLM Requests:", f"{total_requests:,}")
+    # Guard dataset_size so an empty dataset cannot raise ZeroDivisionError
+    if total_requests > 0 and dataset_size > 0:
+        requests_per_sample = total_requests / dataset_size
+        summary_table.add_row("Requests per Sample:", f"{requests_per_sample:.1f}")
+    if max_concurrency is not None:
+        summary_table.add_row("Max Concurrency:", str(max_concurrency))
+    # Display summary panel
+    console.print()
+    console.print(
+        Panel(
+            summary_table,
+            title=f"[bold bright_white]Time Estimation for {dataset_size:,} Samples[/bold bright_white]",
+            border_style="bright_blue",
+        )
+    )
+    # Display per-block breakdown if available
+    block_estimates = time_estimation.get("block_estimates", [])
+    if block_estimates:
+        console.print()
+        # Create per-block table
+        block_table = Table(
+            show_header=True,
+            header_style="bold bright_white",
+        )
+        block_table.add_column("Block Name", style="bright_cyan", width=20)
+        block_table.add_column("Time", justify="right", style="bright_yellow", width=10)
+        block_table.add_column(
+            "Requests", justify="right", style="bright_green", width=10
+        )
+        block_table.add_column(
+            "Throughput", justify="right", style="bright_blue", width=12
+        )
+        block_table.add_column(
+            "Amplif.", justify="right", style="bright_magenta", width=10
+        )
+        for block in block_estimates:
+            # Format time
+            block_seconds = block["estimated_time"]
+            if block_seconds < 60:
+                block_time_str = f"{block_seconds:.1f}s"
+            else:
+                block_time_str = f"{block_seconds / 60:.1f}min"
+            block_table.add_row(
+                block["block"],
+                block_time_str,
+                f"{block['estimated_requests']:,.0f}",
+                f"{block['throughput']:.2f}/s",
+                f"{block['amplification']:.1f}x",
+            )
+        console.print(
+            Panel(
+                block_table,
+                title="[bold bright_white]Per-Block Breakdown[/bold bright_white]",
+                border_style="bright_blue",
+            )
+        )
+    console.print()
 def save_metrics_to_json(
     block_metrics: list[dict[str, Any]],
     flow_name: str,

sdg_hub/core/utils/time_estimator.py ADDED Viewed

@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Time estimation utility for predicting full dataset execution time from dry_run results."""
+# Standard
+from typing import Dict, Optional
+import math
+# Default max concurrent requests used during dry runs; also the value the
+# estimator falls back to when the caller passes no explicit concurrency limit
+DRY_RUN_MAX_CONCURRENT = 100
+# Conservative estimation factor (20% buffer for API variability, network latency, etc.);
+# every final estimate is multiplied by it
+ESTIMATION_BUFFER_FACTOR = 1.2
+def is_llm_using_block(block_info: Dict) -> bool:
+    """Detect if a block uses LLMs.
+    A block counts as LLM-using when its ``block_type`` contains one of the
+    known keywords ("LLMChatBlock", "Evaluate", "Verify") or when its
+    ``parameters_used`` mapping has a "model", "api_base" or "api_key" key.
+    Missing or ``None`` values for either field are treated as empty.
+    Parameters
+    ----------
+    block_info : Dict
+        Block information from dry_run results containing block_type and parameters_used.
+    Returns
+    -------
+    bool
+        True if the block uses LLMs, False otherwise.
+    Examples
+    --------
+    >>> block = {"block_type": "LLMChatBlock", "parameters_used": {"model": "gpt-4"}}
+    >>> is_llm_using_block(block)
+    True
+    """
+    # `or` also normalizes an explicit None, which would break the `in` tests below
+    block_type = block_info.get("block_type") or ""
+    params = block_info.get("parameters_used") or {}
+    # Direct LLM blocks or evaluation/verification blocks
+    if any(kw in block_type for kw in ("LLMChatBlock", "Evaluate", "Verify")):
+        return True
+    # Otherwise fall back to the presence of model/endpoint parameters
+    return any(key in params for key in ("model", "api_base", "api_key"))
+def calculate_block_throughput(
+    block_1: Dict, block_2: Dict, samples_1: int, samples_2: int
+) -> Dict:
+    """Derive throughput, amplification and startup overhead from two dry runs.
+    Fits the linear model ``time = startup_overhead + requests / throughput``
+    through the two (input_rows, execution_time_seconds) measurements. When the
+    second run is not both larger and slower than the first, the fit is skipped
+    and the better of the two raw rows-per-second rates is used instead.
+    Parameters
+    ----------
+    block_1 : Dict
+        Block execution info from first dry run.
+    block_2 : Dict
+        Block execution info from second dry run.
+    samples_1 : int
+        Number of samples in first dry run.
+    samples_2 : int
+        Number of samples in second dry run.
+    Returns
+    -------
+    Dict
+        Dictionary containing:
+        - throughput: float, requests per second
+        - amplification: float, average requests per input sample
+        - startup_overhead: float, fixed startup time in seconds
+    Raises
+    ------
+    ValueError
+        If throughput cannot be calculated due to invalid measurements.
+    Examples
+    --------
+    >>> block1 = {"execution_time_seconds": 1.0, "input_rows": 1, "block_name": "test"}
+    >>> block2 = {"execution_time_seconds": 2.0, "input_rows": 5, "block_name": "test"}
+    >>> result = calculate_block_throughput(block1, block2, 1, 5)
+    >>> assert result["throughput"] > 0
+    """
+    elapsed_a = block_1.get("execution_time_seconds", 0)
+    elapsed_b = block_2.get("execution_time_seconds", 0)
+    rows_a = block_1.get("input_rows", 0)
+    rows_b = block_2.get("input_rows", 0)
+    # Requests per sample for each run; a run with no samples counts as 1x
+    ratio_a = rows_a / samples_1 if samples_1 > 0 else 1
+    ratio_b = rows_b / samples_2 if samples_2 > 0 else 1
+    usable_for_fit = rows_b > rows_a and elapsed_b > elapsed_a
+    if not usable_for_fit:
+        # No usable slope: take the best observed rate and assume the
+        # overhead is 10% of the first run, capped at 2 seconds
+        rate_a = rows_a / elapsed_a if elapsed_a > 0 else 0
+        rate_b = rows_b / elapsed_b if elapsed_b > 0 else 0
+        throughput = max(rate_a, rate_b)
+        overhead = min(2.0, elapsed_a * 0.1)
+    else:
+        # Slope of the line = marginal seconds per additional request
+        seconds_per_request = (elapsed_b - elapsed_a) / (rows_b - rows_a)
+        throughput = 1.0 / seconds_per_request if seconds_per_request > 0 else 0
+        # Y-intercept of the line = fixed startup cost, never negative
+        overhead = max(0, elapsed_a - (rows_a * seconds_per_request))
+    if throughput == 0:
+        raise ValueError(
+            f"Cannot calculate throughput for block '{block_1.get('block_name', 'unknown')}': "
+            f"No valid measurements from dry runs (time_1={elapsed_a}, time_2={elapsed_b}, "
+            f"requests_1={rows_a}, requests_2={rows_b})"
+        )
+    return {
+        "throughput": throughput,
+        "amplification": (ratio_a + ratio_b) / 2,
+        "startup_overhead": overhead,
+    }
+def calculate_time_with_pipeline(
+    num_requests: float,
+    throughput: float,
+    startup_overhead: float,
+    max_concurrent: int = DRY_RUN_MAX_CONCURRENT,
+) -> float:
+    """Calculate time considering pipeline behavior and max concurrent limit.
+    Models execution time as ``startup_overhead + num_requests / effective_throughput``.
+    With ``max_concurrent == 1`` the measured throughput is used as-is; otherwise
+    it is scaled by a logarithmic pipelining factor that is capped at a 10% gain.
+    Parameters
+    ----------
+    num_requests : float
+        Total number of requests to process.
+    throughput : float
+        Base throughput in requests per second. Must be greater than 0.
+    startup_overhead : float
+        Fixed startup time overhead in seconds.
+    max_concurrent : int, optional
+        Maximum number of concurrent requests, by default 100. ``None`` or a
+        non-positive value is treated as 1.
+    Returns
+    -------
+    float
+        Estimated total execution time in seconds (0.0 when there is nothing to process).
+    Raises
+    ------
+    ValueError
+        If ``throughput`` is not greater than 0 while there are requests to process.
+    Examples
+    --------
+    >>> time = calculate_time_with_pipeline(1000, 10.0, 0.5, 50)
+    >>> assert time > 0
+    """
+    if num_requests <= 0:
+        return 0.0
+    # A zero throughput would divide by zero and a negative one would yield a
+    # negative duration; fail with the same exception type the throughput
+    # calculation uses for invalid measurements
+    if throughput <= 0:
+        raise ValueError(f"throughput must be greater than 0, got {throughput}")
+    # Validate and clamp max_concurrent so math.log never sees a non-positive value
+    if max_concurrent is None or max_concurrent <= 0:
+        max_concurrent = 1
+    # The throughput is what we measured - it represents the server's processing capability
+    if max_concurrent == 1:
+        # Sequential execution - no pipelining benefit
+        effective_throughput = throughput
+    else:
+        # Concurrent execution - small pipelining benefit
+        # Logarithmic growth models diminishing returns; the factor reaches
+        # its 10% cap at a concurrency of 100 (conservative estimate)
+        pipelining_factor = 1.0 + (0.1 * math.log(max_concurrent) / math.log(100))
+        pipelining_factor = min(pipelining_factor, 1.1)  # Cap at 10% improvement
+        effective_throughput = throughput * pipelining_factor
+    # Calculate total time
+    return startup_overhead + (num_requests / effective_throughput)
+def estimate_execution_time(
+    dry_run_1: Dict,
+    dry_run_2: Optional[Dict] = None,
+    total_dataset_size: Optional[int] = None,
+    max_concurrency: Optional[int] = None,
+) -> Dict:
+    """Estimate execution time based on dry run results.
+    Estimates the total execution time for a full dataset based on one or two
+    dry runs with smaller sample sizes.
+    With a single dry run, each block's measured time is scaled linearly by
+    ``total_dataset_size / sample_size`` and the results are summed. With two
+    dry runs, only blocks detected by ``is_llm_using_block`` contribute: their
+    throughput, amplification and startup overhead are derived from the pair
+    of measurements and fed into ``calculate_time_with_pipeline``; the time of
+    non-LLM blocks is not included in that mode.
+    The estimates include a conservative buffer (20%) to account for API variability,
+    network latency, and other real-world factors.
+    Parameters
+    ----------
+    dry_run_1 : Dict
+        Results from first dry run, must contain 'sample_size' and 'execution_time_seconds'.
+    dry_run_2 : Optional[Dict], optional
+        Results from second dry run for async estimation, by default None.
+    total_dataset_size : Optional[int], optional
+        Size of full dataset to estimate for. If None, uses original_dataset_size from dry_run_1.
+    max_concurrency : Optional[int], optional
+        Maximum concurrent requests allowed, by default None, which is
+        replaced with DRY_RUN_MAX_CONCURRENT (100).
+    Returns
+    -------
+    Dict
+        Estimation results containing:
+        - estimated_time_seconds: float, estimated time with current configuration (includes buffer)
+        - total_estimated_requests: int, total LLM requests (0 for single-run estimates)
+        - block_estimates: list, per-block estimates (two-run estimates only)
+        - note: str, additional information (single-run estimates only)
+    Raises
+    ------
+    ValueError
+        If a single-run estimate is requested with a non-positive sample size,
+        or if a block's throughput cannot be derived from the two dry runs.
+    Examples
+    --------
+    >>> dry_run = {"sample_size": 2, "execution_time_seconds": 10.0}
+    >>> result = estimate_execution_time(dry_run, total_dataset_size=100)
+    >>> assert result["estimated_time_seconds"] > 0
+    >>>
+    >>> # With two dry runs for async estimation
+    >>> dry_run_1 = {"sample_size": 1, "execution_time_seconds": 5.0, "blocks_executed": [...]}
+    >>> dry_run_2 = {"sample_size": 5, "execution_time_seconds": 20.0, "blocks_executed": [...]}
+    >>> result = estimate_execution_time(dry_run_1, dry_run_2, total_dataset_size=1000)
+    >>> assert result["estimated_time_seconds"] > 0
+    """
+    # Set defaults
+    if max_concurrency is None:
+        max_concurrency = DRY_RUN_MAX_CONCURRENT
+    samples_1 = dry_run_1["sample_size"]
+    if total_dataset_size is None:
+        total_dataset_size = dry_run_1.get("original_dataset_size", samples_1)
+    # If only one dry run, do simple linear scaling
+    if dry_run_2 is None:
+        if samples_1 <= 0:
+            raise ValueError(
+                f"dry_run_1 sample_size must be greater than 0, got {samples_1}"
+            )
+        blocks_executed = dry_run_1.get("blocks_executed", [])
+        if not blocks_executed:
+            # Fallback to whole-run scaling if no block details are available
+            simple_estimate = (
+                dry_run_1["execution_time_seconds"] / samples_1
+            ) * total_dataset_size
+            return {
+                "estimated_time_seconds": simple_estimate * ESTIMATION_BUFFER_FACTOR,
+                "total_estimated_requests": 0,
+                "note": "Synchronous execution - linear scaling from dry run",
+            }
+        # Scale each block by dataset size over *sample* size. A block's
+        # input_rows can exceed the sample size when earlier blocks amplify
+        # rows, and the full run amplifies the same way, so dividing by
+        # input_rows would under-estimate. Blocks that received no rows are skipped.
+        total_estimated_time = sum(
+            (block.get("execution_time_seconds", 0) / samples_1) * total_dataset_size
+            for block in blocks_executed
+            if block.get("input_rows", samples_1) > 0
+        )
+        return {
+            "estimated_time_seconds": total_estimated_time * ESTIMATION_BUFFER_FACTOR,
+            "total_estimated_requests": 0,
+            "note": "Synchronous execution - no concurrency",
+        }
+    # Two dry runs: analyze each block with async execution
+    samples_2 = (
+        dry_run_2["sample_size"] if dry_run_2 else 5
+    )  # Default to 5 if not provided
+    block_estimates = []
+    total_time = 0
+    total_requests = 0
+    # Blocks are paired by position; zip stops at the shorter run
+    paired_blocks = zip(
+        dry_run_1.get("blocks_executed", []),
+        dry_run_2.get("blocks_executed", []),
+    )
+    for block_1, block_2 in paired_blocks:
+        # Only process LLM blocks
+        if not is_llm_using_block(block_1):
+            continue
+        # Calculate throughput and amplification
+        analysis = calculate_block_throughput(block_1, block_2, samples_1, samples_2)
+        # Estimate requests for full dataset
+        estimated_requests = total_dataset_size * analysis["amplification"]
+        # Calculate time with pipeline model
+        block_time = calculate_time_with_pipeline(
+            estimated_requests,
+            analysis["throughput"],
+            analysis["startup_overhead"],
+            max_concurrency,
+        )
+        total_time += block_time
+        total_requests += estimated_requests
+        block_estimates.append(
+            {
+                "block": block_1["block_name"],
+                "estimated_requests": estimated_requests,
+                "throughput": analysis["throughput"],
+                "estimated_time": block_time,
+                "amplification": analysis["amplification"],
+                "startup_overhead": analysis["startup_overhead"],
+            }
+        )
+    # Apply conservative buffer to account for API variability, network issues, etc.
+    return {
+        "estimated_time_seconds": total_time * ESTIMATION_BUFFER_FACTOR,
+        "total_estimated_requests": int(total_requests),
+        "block_estimates": block_estimates,
+    }

sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml CHANGED Viewed

@@ -19,7 +19,7 @@ metadata:
     - "japanese"
   license: "Apache-2.0"
   dataset_requirements:
     required_columns:
       - "document"
@@ -54,17 +54,19 @@ blocks:
       output_cols: raw_summary_detailed
       max_tokens: 2048
       async_mode: true
+      # n: 2
   - block_type: LLMParserBlock
     block_config:
-      block_name: extract_detailed_summary
+      block_name: detailed_summary
       input_cols: raw_summary_detailed
       extract_content: true
+      # extract_reasoning_content: true
   - block_type: TextParserBlock
     block_config:
       block_name: parse_detailed_summary
-      input_cols: extract_detailed_summary_content
+      input_cols: detailed_summary_content
       output_cols: summary_detailed
       start_tags: [""]
       end_tags: [""]
@@ -86,14 +88,14 @@ blocks:
   - block_type: LLMParserBlock
     block_config:
-      block_name: extract_atomic_facts
+      block_name: atomic_facts
       input_cols: raw_atomic_facts
       extract_content: true
   - block_type: TextParserBlock
     block_config:
       block_name: parse_atomic_facts
-      input_cols: extract_atomic_facts_content
+      input_cols: atomic_facts_content
       output_cols: summary_atomic_facts
       start_tags: [""]
       end_tags: [""]
@@ -115,14 +117,14 @@ blocks:
   - block_type: LLMParserBlock
     block_config:
-      block_name: extract_extractive_summary
+      block_name: extractive_summary
       input_cols: raw_summary_extractive
       extract_content: true
   - block_type: TextParserBlock
     block_config:
       block_name: parse_extractive_summary
-      input_cols: extract_extractive_summary_content
+      input_cols: extractive_summary_content
       output_cols: summary_extractive
       start_tags: [""]
       end_tags: [""]
@@ -156,14 +158,14 @@ blocks:
   - block_type: LLMParserBlock
     block_config:
-      block_name: extract_knowledge_generation
+      block_name: get_knowledge_generation
       input_cols: raw_knowledge_generation
       extract_content: true
   - block_type: TextParserBlock
     block_config:
       block_name: parse_knowledge_generation
-      input_cols: extract_knowledge_generation_content
+      input_cols: get_knowledge_generation_content
       output_cols: [question, response]
       parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
       parser_cleanup_tags: ["[END]"]

{sdg_hub-0.4.1.dist-info → sdg_hub-0.4.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.4.1
+Version: 0.4.2
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0

{sdg_hub-0.4.1.dist-info → sdg_hub-0.4.2.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
-sdg_hub/_version.py,sha256=k7cu0JKra64gmMNU_UfA5sw2eNc_GRvf3QmesiYAy8g,704
+sdg_hub/_version.py,sha256=A45grTqzrHuDn1CT9K5GVUbY4_Q3OSTcXAl3zdHzcEI,704
 sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
 sdg_hub/core/blocks/__init__.py,sha256=5FsbkcO-dmBv6MqO96TPn9FKKPTQZQCv20j4wR7UvQw,1502
@@ -33,20 +33,21 @@ sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqD
 sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
 sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
 sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
-sdg_hub/core/flow/base.py,sha256=IRnNEZ3laDmR4sW_MTseL4syhLuUylyHY_0tS5QaS-A,54084
+sdg_hub/core/flow/base.py,sha256=4kR-dKXAlLFSwm3YWdT8EoedCIGJT56agcot3tQb6VY,59508
 sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
 sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
 sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d4,7558
 sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
 sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
-sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
+sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
 sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
 sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
 sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
 sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
-sdg_hub/core/utils/flow_metrics.py,sha256=VOdreUzP0kPgnkPjuQk87tZsK5f1u6XGEPM8ugCt0CY,8824
+sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
 sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
 sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
+sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
 sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
 sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml,sha256=THRT3cY44KGI_69B2wqt2Q89EknnOSE7B4A_jdnxlIU,330
@@ -78,7 +79,7 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/j
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
-sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=iY1N6CY97fEkqI5oqaamSfqmiXpHPhWH_aOppsMxVjY,9176
+sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=jumjKmKshSd8hoTYpyBJ0nMOADeQmxBmNPY7yfa_xQ8,9171
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
 sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
 sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274
@@ -87,8 +88,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
 sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
 sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
 sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
-sdg_hub-0.4.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sdg_hub-0.4.1.dist-info/METADATA,sha256=pLRs5oOsVI9515UEZxcUEZFZhCoZ0kli0KLpBPPPB7w,9783
-sdg_hub-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sdg_hub-0.4.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
-sdg_hub-0.4.1.dist-info/RECORD,,
+sdg_hub-0.4.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.4.2.dist-info/METADATA,sha256=5qbw9_DoVmfntmQlvz4VPdQXdUXoLO8Zhrxbc1uY7b0,9783
+sdg_hub-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.4.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.4.2.dist-info/RECORD,,

{sdg_hub-0.4.1.dist-info → sdg_hub-0.4.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{sdg_hub-0.4.1.dist-info → sdg_hub-0.4.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{sdg_hub-0.4.1.dist-info → sdg_hub-0.4.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sdg-hub 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

sdg-hub 0.4.1py3-none-any.whl → 0.4.2py3-none-any.whl