sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -106
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- sdg_hub/core/utils/temp_manager.py +0 -57
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/checkpointer.py CHANGED

@@ -9,7 +9,7 @@ import os
 import uuid

 # Third Party
-
+import pandas as pd

 # Local
 from ..utils.datautils import safe_concatenate_with_validation

@@ -67,18 +67,18 @@ class FlowCheckpointer:
         return os.path.join(self.checkpoint_dir, "flow_metadata.json")

     def load_existing_progress(
-        self, input_dataset:
-    ) -> Tuple[
+        self, input_dataset: pd.DataFrame
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """Load existing checkpoint data and determine remaining work.

         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
            Original input dataset for the flow.

         Returns
         -------
-        Tuple[
+        Tuple[pd.DataFrame, Optional[pd.DataFrame]]
            (remaining_samples_to_process, completed_samples_dataset)
            If no checkpoints exist, returns (input_dataset, None)
         """

@@ -127,20 +127,20 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load checkpoints: {exc}. Starting from scratch.")
             return input_dataset, None

-    def add_completed_samples(self, samples:
+    def add_completed_samples(self, samples: pd.DataFrame) -> None:
         """Add samples that have completed the entire flow.

         Parameters
         ----------
-        samples :
+        samples : pd.DataFrame
            Samples that have completed processing through all blocks.
         """
         if not self.is_enabled:
            return

         # Add to pending samples
-        for sample in samples:
-            self._pending_samples.append(sample)
+        for _, sample in samples.iterrows():
+            self._pending_samples.append(sample.to_dict())
            self._samples_processed += 1

         # Check if we should save a checkpoint

@@ -167,9 +167,9 @@ class FlowCheckpointer:
             self.checkpoint_dir, f"checkpoint_{self._checkpoint_counter:04d}.jsonl"
         )

-        # Convert pending samples to
-
-
+        # Convert pending samples to dataframe and save
+        checkpoint_df = pd.DataFrame(self._pending_samples)
+        checkpoint_df.to_json(checkpoint_file, orient="records", lines=True)

         # Update metadata
         self._save_metadata()

@@ -207,7 +207,7 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load metadata: {exc}")
             return None

-    def _load_completed_samples(self) -> Optional[
+    def _load_completed_samples(self) -> Optional[pd.DataFrame]:
         """Load all completed samples from checkpoint files."""
         checkpoint_files = []
         checkpoint_dir = Path(self.checkpoint_dir)

@@ -222,27 +222,25 @@ class FlowCheckpointer:
         # Sort checkpoint files by number
         checkpoint_files.sort()

-        # Load and concatenate all checkpoint
-
+        # Load and concatenate all checkpoint dataframes
+        dataframes = []
         for file_path in checkpoint_files:
             try:
-
-                if len(
-
-                    logger.debug(
-                        f"Loaded checkpoint: {file_path} ({len(dataset)} samples)"
-                    )
+                df = pd.read_json(file_path, lines=True)
+                if len(df) > 0:
+                    dataframes.append(df)
+                    logger.debug(f"Loaded checkpoint: {file_path} ({len(df)} samples)")
             except Exception as exc:
                 logger.warning(f"Failed to load checkpoint {file_path}: {exc}")

-        if not
+        if not dataframes:
             return None

-        return safe_concatenate_with_validation(
+        return safe_concatenate_with_validation(dataframes, "checkpoint files")

     def _find_remaining_samples(
-        self, input_dataset:
-    ) ->
+        self, input_dataset: pd.DataFrame, completed_dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Find samples from input_dataset that are not in completed_dataset.

         Note: Assumes input_dataset contains unique samples. For datasets with

@@ -250,19 +248,19 @@ class FlowCheckpointer:

         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
            Original input dataset (assumed to contain unique samples).
-        completed_dataset :
+        completed_dataset : pd.DataFrame
            Dataset of completed samples.

         Returns
         -------
-
+        pd.DataFrame
            Samples that still need processing.
         """
         # Get common columns for comparison
-        input_columns = set(input_dataset.
-        completed_columns = set(completed_dataset.
+        input_columns = set(input_dataset.columns.tolist())
+        completed_columns = set(completed_dataset.columns.tolist())
         common_columns = list(input_columns & completed_columns)

         if not common_columns:

@@ -272,9 +270,9 @@ class FlowCheckpointer:
             )
             return input_dataset

-        #
-        input_df = input_dataset
-        completed_df = completed_dataset
+        # Select only common columns for comparison
+        input_df = input_dataset[common_columns]
+        completed_df = completed_dataset[common_columns]

         # Find rows that haven't been completed
         # Use tuple representation for comparison

@@ -287,10 +285,10 @@ class FlowCheckpointer:
         remaining_indices = input_df[remaining_mask].index.tolist()

         if not remaining_indices:
-            # Return empty
-            return input_dataset.
+            # Return empty dataframe with same structure
+            return input_dataset.iloc[0:0]

-        return input_dataset.
+        return input_dataset.iloc[remaining_indices]

     def get_progress_info(self) -> Dict[str, Any]:
         """Get information about current progress.

sdg_hub/core/flow/validation.py CHANGED

@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any

 # Third Party
-
+import pandas as pd

 if TYPE_CHECKING:
     # Local

@@ -180,14 +180,14 @@ class FlowValidator:

         return errors

-    def validate_flow_execution(self, flow: "Flow", dataset:
+    def validate_flow_execution(self, flow: "Flow", dataset: pd.DataFrame) -> list[str]:
         """Validate that a flow can be executed with the given dataset.

         Parameters
         ----------
         flow : Flow
            The flow to validate.
-        dataset :
+        dataset : pd.DataFrame
            Dataset to validate against.

         Returns

@@ -206,7 +206,7 @@ class FlowValidator:
            return errors

        # Track available columns as we progress through blocks
-        current_columns = set(dataset.
+        current_columns = set(dataset.columns.tolist())

        for _i, block in enumerate(flow.blocks):
            block_name = block.block_name

sdg_hub/core/utils/datautils.py CHANGED

@@ -1,33 +1,67 @@
 # Third Party
-from datasets import Dataset, concatenate_datasets
 import numpy as np
+import pandas as pd

 # Local
 from .error_handling import FlowValidationError


+def _is_hashable(x):
+    """Check if a value is hashable."""
+    try:
+        hash(x)
+        return True
+    except TypeError:
+        return False
+
+
+def _make_hashable(x):
+    """Convert any value to a hashable representation for duplicate detection.
+
+    Handles numpy arrays, dicts, sets, lists, and other complex types by
+    converting them to hashable equivalents (tuples, frozensets, etc.).
+    """
+    if _is_hashable(x):
+        return x
+    if isinstance(x, np.ndarray):
+        if x.ndim == 0:
+            return _make_hashable(x.item())
+        return tuple(_make_hashable(i) for i in x)
+    if isinstance(x, dict):
+        return tuple(
+            sorted(
+                ((k, _make_hashable(v)) for k, v in x.items()),
+                key=lambda kv: repr(kv[0]),
+            )
+        )
+    if isinstance(x, (set, frozenset)):
+        return frozenset(_make_hashable(i) for i in x)
+    if hasattr(x, "__iter__"):
+        return tuple(_make_hashable(i) for i in x)
+    return repr(x)
+
+
 def safe_concatenate_datasets(datasets: list):
     """Concatenate datasets safely, ignoring any datasets that are None or empty."""
-    filtered_datasets = [ds for ds in datasets if ds is not None and ds
+    filtered_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]

     if not filtered_datasets:
         return None

-    return
+    return pd.concat(filtered_datasets, ignore_index=True)


-def validate_no_duplicates(dataset:
+def validate_no_duplicates(dataset: pd.DataFrame) -> None:
     """
     Validate that the input dataset contains only unique rows.

     Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
-    to handle numpy arrays that cause TypeError in pandas
-
-    of the duplicate rows detected.
+    to handle numpy arrays and other unhashable types that cause TypeError in pandas
+    duplicate detection.

     Parameters
     ----------
-    dataset :
+    dataset : pd.DataFrame
        Input dataset to validate.

     Raises

@@ -38,47 +72,11 @@ def validate_no_duplicates(dataset: Dataset) -> None:
     if len(dataset) == 0:
         return

-
-
-
-
-
-            return True
-        except TypeError:
-            return False
-
-    def make_hashable(x):
-        if is_hashable(x):
-            # int, float, str, bytes, None etc. are already hashable
-            return x
-        if isinstance(x, np.ndarray):
-            if x.ndim == 0:
-                return make_hashable(x.item())
-            return tuple(make_hashable(i) for i in x)
-        if isinstance(x, dict):
-            # sort robustly even with heterogeneous key types
-            return tuple(
-                sorted(
-                    ((k, make_hashable(v)) for k, v in x.items()),
-                    key=lambda kv: repr(kv[0]),
-                )
-            )
-        if isinstance(x, (set, frozenset)):
-            # order‑insensitive
-            return frozenset(make_hashable(i) for i in x)
-        if hasattr(x, "__iter__"):
-            # lists, tuples, custom iterables
-            return tuple(make_hashable(i) for i in x)
-        # last‑resort fallback to a stable representation
-        return repr(x)
-
-    # Apply to the whole dataframe to ensure every cell is hashable
-    if hasattr(df, "map"):
-        df = df.map(make_hashable)
-    else:
-        df = df.applymap(make_hashable)
-
-    duplicate_count = int(df.duplicated(keep="first").sum())
+    # Transform all cells to hashable representations for duplicate detection
+    # This creates a temporary copy but is necessary for reliable duplicate detection
+    hashable_df = dataset.map(_make_hashable)
+
+    duplicate_count = int(hashable_df.duplicated(keep="first").sum())
     if duplicate_count > 0:
         raise FlowValidationError(
             f"Input dataset contains {duplicate_count} duplicate rows. "

@@ -89,19 +87,19 @@ def validate_no_duplicates(dataset: Dataset) -> None:

 def safe_concatenate_with_validation(
     datasets: list, context: str = "datasets"
-) ->
+) -> pd.DataFrame:
     """Safely concatenate datasets with schema validation and clear error messages.

     Parameters
     ----------
-    datasets : list[
+    datasets : list[pd.DataFrame]
        List of datasets to concatenate
     context : str
        Description of what's being concatenated for error messages

     Returns
     -------
-
+    pd.DataFrame
        Concatenated dataset

     Raises

@@ -119,12 +117,12 @@ def safe_concatenate_with_validation(
         return valid_datasets[0]

     try:
-        return
+        return pd.concat(valid_datasets, ignore_index=True)
     except Exception as e:
         # Schema mismatch or other concatenation error
         schema_info = []
         for i, ds in enumerate(valid_datasets):
-            schema_info.append(f"Dataset {i}: columns={ds.
+            schema_info.append(f"Dataset {i}: columns={ds.columns.tolist()}")

         schema_details = "\n".join(schema_info)
         raise FlowValidationError(
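
The rewritten `validate_no_duplicates` maps every cell to a hashable value before calling `DataFrame.duplicated()`, because rows containing lists or numpy arrays would otherwise raise `TypeError: unhashable type`. A small illustration of that failure mode and the workaround, with made-up data and a simplified stand-in for the module's `_make_hashable` helper:

    import numpy as np
    import pandas as pd

    def to_hashable(cell):
        # simplified stand-in for _make_hashable from the diff above
        try:
            hash(cell)
            return cell
        except TypeError:
            if isinstance(cell, (list, tuple, np.ndarray)):
                return tuple(to_hashable(item) for item in cell)
            return repr(cell)

    df = pd.DataFrame({"tags": [[1, 2], [1, 2], [3]]})
    # df.duplicated() would raise TypeError here, since list cells are unhashable
    duplicates = int(df.map(to_hashable).duplicated(keep="first").sum())
    print(duplicates)  # 1 -- the second [1, 2] row is a duplicate

(`DataFrame.map` requires pandas 2.1 or newer; older versions spell it `applymap`, which is why the pre-0.6.1 code carried the `hasattr(df, "map")` fallback that this release drops.)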
sdg_hub/core/utils/flow_metrics.py CHANGED

@@ -8,12 +8,13 @@ from typing import Any, Optional
 import json
 import time

-# Third Party
-from datasets import Dataset
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table

+# Third Party
+import pandas as pd
+

 def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """Aggregate per-block metrics, coalescing chunked runs.

@@ -71,7 +72,7 @@ def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any
 def display_metrics_summary(
     block_metrics: list[dict[str, Any]],
     flow_name: str,
-    final_dataset: Optional[
+    final_dataset: Optional[pd.DataFrame] = None,
 ) -> None:
     """Display a rich table summarizing block execution metrics.

@@ -81,7 +82,7 @@ def display_metrics_summary(
        Raw block metrics from flow execution.
     flow_name : str
        Name of the flow for display title.
-    final_dataset : Optional[
+    final_dataset : Optional[pd.DataFrame], optional
        Final dataset from flow execution. None if flow failed.
     """
     if not block_metrics:

@@ -146,8 +147,10 @@ def display_metrics_summary(

     # Add summary row
     table.add_section()
-    final_row_count = len(final_dataset) if final_dataset else 0
-    final_col_count =
+    final_row_count = len(final_dataset) if final_dataset is not None else 0
+    final_col_count = (
+        len(final_dataset.columns.tolist()) if final_dataset is not None else 0
+    )

     table.add_row(
         "[bold]TOTAL[/bold]",
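
The summary-row change replaces a bare truthiness check on the final dataset: with a pandas `DataFrame`, `if final_dataset` raises `ValueError` because the truth value of a DataFrame is ambiguous, so the code now tests `is not None` explicitly. A short sketch of the difference, with made-up data:

    import pandas as pd

    final_dataset = pd.DataFrame({"question": ["q1"], "answer": ["a1"]})

    try:
        rows = len(final_dataset) if final_dataset else 0  # old pattern
    except ValueError:
        rows = None  # bool(DataFrame) raises, even for a non-empty frame

    # new pattern: explicit None check works for both a DataFrame and None
    rows = len(final_dataset) if final_dataset is not None else 0
    cols = len(final_dataset.columns.tolist()) if final_dataset is not None else 0
    print(rows, cols)  # 1 2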
{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.5.1
+Version: 0.6.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0

@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
 Requires-Dist: rich
+Requires-Dist: pandas
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: tenacity!=8.4.0,>=8.3.0
 Requires-Dist: tqdm<5.0.0,>=4.66.2
-Provides-Extra: vllm
-Requires-Dist: vllm>=0.9.1; extra == "vllm"
-Requires-Dist: torch>=2.0.0; extra == "vllm"
-Requires-Dist: transformers>=4.37.0; extra == "vllm"
-Requires-Dist: accelerate>=0.21.0; extra == "vllm"
-Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
 Provides-Extra: examples
 Requires-Dist: tabulate>=0.9.0; extra == "examples"
 Requires-Dist: transformers>=4.37.0; extra == "examples"
 Requires-Dist: langchain-text-splitters; extra == "examples"
 Requires-Dist: docling>=2.3.0; extra == "examples"
 Requires-Dist: scikit-learn; extra == "examples"
-Requires-Dist: pandas; extra == "examples"
 Requires-Dist: polars; extra == "examples"
 Requires-Dist: matplotlib; extra == "examples"
 Requires-Dist: spacy; extra == "examples"

@@ -76,7 +70,9 @@ Dynamic: license-file
 [](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
 [](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)

-
+<p align="center">
+    <img src="docs/assets/sdg-hub-cover.png" alt="SDG Hub Cover" width="400">
+</p>

 A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.

{sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD CHANGED

@@ -1,42 +1,40 @@
 sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
-sdg_hub/_version.py,sha256=
+sdg_hub/_version.py,sha256=7vNQiXfKffK0nbqts6Xy6-E1b1YOm4EGigvgaHr83o4,704
 sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
 sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
-sdg_hub/core/blocks/base.py,sha256
+sdg_hub/core/blocks/base.py,sha256=EpHvqXySIdx0f672c-csGKKs7N57ablC8pad_SiB1s8,13066
 sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
 sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
-sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=
-sdg_hub/core/blocks/llm/__init__.py,sha256=
+sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=tHNykB-Q_ItbjDzvlpnjt0Z46mR67O6ZY29ed2ecOwo,6493
+sdg_hub/core/blocks/llm/__init__.py,sha256=1Oo2nv2uXJ2AzRlrQcqDi7gW1FNh9Fid84L89dvy4qM,683
 sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
-sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/
-sdg_hub/core/blocks/llm/text_parser_block.py,sha256=NGwBdFmfbY3rbm_T7bqTJmaREo2MpSpQwgLrnHHZHqU,14255
+sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=ckkjF_r9CxoX2sJiikFWFxNrAS4w_gMnedo70TrQo3Y,22730
+sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=NFk8xXceK_F1Pzn9dFNX65ynavuoQiH2ltDLLY_6SXQ,12136
+sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=zI8DFz34abGnH2Mk0KQe4Mkkb5ophwV7brn4axNsZ2I,14146
+sdg_hub/core/blocks/llm/text_parser_block.py,sha256=CoyfgKcJL9JpokzMcKk4bYeEBr6xnN0XYk45hJANnBQ,12763
 sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
-sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=
-sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=
-sdg_hub/core/blocks/transform/json_structure_block.py,sha256=
-sdg_hub/core/blocks/transform/melt_columns.py,sha256=
-sdg_hub/core/blocks/transform/rename_columns.py,sha256=
-sdg_hub/core/blocks/transform/text_concat.py,sha256=
-sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=
+sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=dYTxgkWq6X2B37pemJdmAVi56A29NF25YTwUUyN9xHs,2837
+sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=W9ezZNgLUGbLk2U1UJCi2KFbSRPM0Q4vHnP5HGlhsoQ,8908
+sdg_hub/core/blocks/transform/json_structure_block.py,sha256=w7Ex2F3gvpG7uUnM2JM1a7D5xUKGE6HRKwyJpnfLPzc,5069
+sdg_hub/core/blocks/transform/melt_columns.py,sha256=zH3d3C0EO2DVRZqmhyr_g51xz1ZmuBRinrngUCiZkrM,4383
+sdg_hub/core/blocks/transform/rename_columns.py,sha256=EafchUDXvfXxqwRvNIcy92I1Zy6U8lsibtSqWaYdMPU,3150
+sdg_hub/core/blocks/transform/text_concat.py,sha256=Oo6VKGdmeiUmH3B0PDL1y_ot-bYmkT2jbGj7g7C84gg,3089
+sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=Osbz-jciBx5jFfzUbtbCBh_ET4CySG2h0IGWChESHi4,3239
 sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
-sdg_hub/core/flow/base.py,sha256=
-sdg_hub/core/flow/checkpointer.py,sha256=
+sdg_hub/core/flow/base.py,sha256=9nCXrCdKMzMAoIpiv2Zo7RzZhiLluXJ9XQAtg3wh_40,59104
+sdg_hub/core/flow/checkpointer.py,sha256=MJay3Q5cfRgJDetk82DaMKJ3ZZUYRHxQabEQTxhGukk,11850
 sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
 sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
-sdg_hub/core/flow/validation.py,sha256=
+sdg_hub/core/flow/validation.py,sha256=6hs16DnusUYPo6vD_7DcgzRP5JOHDf2wPvgqvBn6hB0,9727
 sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
-sdg_hub/core/utils/datautils.py,sha256=
+sdg_hub/core/utils/datautils.py,sha256=7YzG_IpMHj04zHl-r7mswOd3IzTQKJJdfmMBgm7VXWM,4082
 sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
 sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
 sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
-sdg_hub/core/utils/flow_metrics.py,sha256=
+sdg_hub/core/utils/flow_metrics.py,sha256=84ihZHOwbxhqPTdnUXclytf5Tva-IoA1oKIruIXv0Eo,12650
 sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
 sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
-sdg_hub/core/utils/temp_manager.py,sha256=moSPWMxoDEw5FmeuwKTC8f3tYcarQDN0ozv0796CeGg,1484
 sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
 sdg_hub/core/utils/yaml_utils.py,sha256=tShCd-FFkp0xlKnLe7dXsMOR4AvT9d2qRUmu4ZnPSEY,1458
 sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -69,7 +67,7 @@ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/j
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml,sha256=OjPZaSCOSLxEWgW3pmNwF7mmLhGhFGTmKL_3rKdqeW4,2488
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml,sha256=nEy_RcotHGiiENrmUANpKkbIFsrARAeSwECrBeHi2so,391
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml,sha256=V90W0IeJQZTFThA8v0UOs3DtZbtU3BI9jkpChw1BULo,402
-sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=
+sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml,sha256=U9DBWSKkYGGtwWQ39o8l7g-mLb93505APTEFePyzqIc,9312
 sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml,sha256=96SQqXG7fmb-50SdX85sgVtrFcQ-oNKe_0BoQdZmY5g,2638
 sdg_hub/flows/text_analysis/__init__.py,sha256=WStks4eM_KHNTVsHglcj8vFghmI0PH9P1hUrijBLbwc,125
 sdg_hub/flows/text_analysis/structured_insights/__init__.py,sha256=_DT4NR05JD9CZoSWROPr2lC6se0VjSqQPZJJlEV79mk,274

@@ -78,8 +76,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
 sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
 sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
 sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
-sdg_hub-0.
+sdg_hub-0.6.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.6.1.dist-info/METADATA,sha256=JQxLH1YwDrV5D1cAaaRziFFiF17buxN-fnyse5lQVV8,9584
+sdg_hub-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.6.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.6.1.dist-info/RECORD,,