PyPI - sdg-hub - Versions diffs - 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

sdg_hub/_version.py +2 -2
sdg_hub/core/blocks/base.py +60 -58
sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
sdg_hub/core/blocks/llm/__init__.py +0 -2
sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
sdg_hub/core/blocks/llm/llm_parser_block.py +13 -7
sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
sdg_hub/core/blocks/llm/text_parser_block.py +14 -9
sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
sdg_hub/core/blocks/transform/melt_columns.py +13 -12
sdg_hub/core/blocks/transform/rename_columns.py +20 -9
sdg_hub/core/blocks/transform/text_concat.py +20 -21
sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
sdg_hub/core/flow/base.py +139 -57
sdg_hub/core/flow/checkpointer.py +34 -36
sdg_hub/core/flow/validation.py +4 -4
sdg_hub/core/utils/datautils.py +52 -54
sdg_hub/core/utils/flow_metrics.py +9 -6
{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/METADATA +2 -8
{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/RECORD +25 -26
sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/WHEEL +0 -0
{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/licenses/LICENSE +0 -0
{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/top_level.txt +0 -0

sdg_hub/core/utils/datautils.py CHANGED Viewed

@@ -1,33 +1,67 @@
 # Third Party
-from datasets import Dataset, concatenate_datasets
 import numpy as np
+import pandas as pd
 # Local
 from .error_handling import FlowValidationError
+def _is_hashable(x):
+    """Check if a value is hashable."""
+    try:
+        hash(x)
+        return True
+    except TypeError:
+        return False
+def _make_hashable(x):
+    """Convert any value to a hashable representation for duplicate detection.
+    Handles numpy arrays, dicts, sets, lists, and other complex types by
+    converting them to hashable equivalents (tuples, frozensets, etc.).
+    """
+    if _is_hashable(x):
+        return x
+    if isinstance(x, np.ndarray):
+        if x.ndim == 0:
+            return _make_hashable(x.item())
+        return tuple(_make_hashable(i) for i in x)
+    if isinstance(x, dict):
+        return tuple(
+            sorted(
+                ((k, _make_hashable(v)) for k, v in x.items()),
+                key=lambda kv: repr(kv[0]),
+            )
+        )
+    if isinstance(x, (set, frozenset)):
+        return frozenset(_make_hashable(i) for i in x)
+    if hasattr(x, "__iter__"):
+        return tuple(_make_hashable(i) for i in x)
+    return repr(x)
 def safe_concatenate_datasets(datasets: list):
     """Concatenate datasets safely, ignoring any datasets that are None or empty."""
-    filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
+    filtered_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]
     if not filtered_datasets:
         return None
-    return concatenate_datasets(filtered_datasets)
+    return pd.concat(filtered_datasets, ignore_index=True)
-def validate_no_duplicates(dataset: Dataset) -> None:
+def validate_no_duplicates(dataset: pd.DataFrame) -> None:
     """
     Validate that the input dataset contains only unique rows.
     Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
-    to handle numpy arrays that cause TypeError in pandas duplicate detection.
-    Raises FlowValidationError if duplicates are found, including a count
-    of the duplicate rows detected.
+    to handle numpy arrays and other unhashable types that cause TypeError in pandas
+    duplicate detection.
     Parameters
     ----------
-    dataset : Dataset
+    dataset : pd.DataFrame
         Input dataset to validate.
     Raises
@@ -38,47 +72,11 @@ def validate_no_duplicates(dataset: Dataset) -> None:
     if len(dataset) == 0:
         return
-    df = dataset.to_pandas()
-    def is_hashable(x):
-        try:
-            hash(x)
-            return True
-        except TypeError:
-            return False
-    def make_hashable(x):
-        if is_hashable(x):
-            # int, float, str, bytes, None etc. are already hashable
-            return x
-        if isinstance(x, np.ndarray):
-            if x.ndim == 0:
-                return make_hashable(x.item())
-            return tuple(make_hashable(i) for i in x)
-        if isinstance(x, dict):
-            # sort robustly even with heterogeneous key types
-            return tuple(
-                sorted(
-                    ((k, make_hashable(v)) for k, v in x.items()),
-                    key=lambda kv: repr(kv[0]),
-                )
-            )
-        if isinstance(x, (set, frozenset)):
-            # order‑insensitive
-            return frozenset(make_hashable(i) for i in x)
-        if hasattr(x, "__iter__"):
-            # lists, tuples, custom iterables
-            return tuple(make_hashable(i) for i in x)
-        # last‑resort fallback to a stable representation
-        return repr(x)
-    # Apply to the whole dataframe to ensure every cell is hashable
-    if hasattr(df, "map"):
-        df = df.map(make_hashable)
-    else:
-        df = df.applymap(make_hashable)
-    duplicate_count = int(df.duplicated(keep="first").sum())
+    # Transform all cells to hashable representations for duplicate detection
+    # This creates a temporary copy but is necessary for reliable duplicate detection
+    hashable_df = dataset.map(_make_hashable)
+    duplicate_count = int(hashable_df.duplicated(keep="first").sum())
     if duplicate_count > 0:
         raise FlowValidationError(
             f"Input dataset contains {duplicate_count} duplicate rows. "
@@ -89,19 +87,19 @@ def validate_no_duplicates(dataset: Dataset) -> None:
 def safe_concatenate_with_validation(
     datasets: list, context: str = "datasets"
-) -> Dataset:
+) -> pd.DataFrame:
     """Safely concatenate datasets with schema validation and clear error messages.
     Parameters
     ----------
-    datasets : list[Dataset]
+    datasets : list[pd.DataFrame]
         List of datasets to concatenate
     context : str
         Description of what's being concatenated for error messages
     Returns
     -------
-    Dataset
+    pd.DataFrame
         Concatenated dataset
     Raises
@@ -119,12 +117,12 @@ def safe_concatenate_with_validation(
         return valid_datasets[0]
     try:
-        return concatenate_datasets(valid_datasets)
+        return pd.concat(valid_datasets, ignore_index=True)
     except Exception as e:
         # Schema mismatch or other concatenation error
         schema_info = []
         for i, ds in enumerate(valid_datasets):
-            schema_info.append(f"Dataset {i}: columns={ds.column_names}")
+            schema_info.append(f"Dataset {i}: columns={ds.columns.tolist()}")
         schema_details = "\n".join(schema_info)
         raise FlowValidationError(

sdg_hub/core/utils/flow_metrics.py CHANGED Viewed

@@ -8,12 +8,13 @@ from typing import Any, Optional
 import json
 import time
-# Third Party
-from datasets import Dataset
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
+# Third Party
+import pandas as pd
 def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """Aggregate per-block metrics, coalescing chunked runs.
@@ -71,7 +72,7 @@ def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any
 def display_metrics_summary(
     block_metrics: list[dict[str, Any]],
     flow_name: str,
-    final_dataset: Optional[Dataset] = None,
+    final_dataset: Optional[pd.DataFrame] = None,
 ) -> None:
     """Display a rich table summarizing block execution metrics.
@@ -81,7 +82,7 @@ def display_metrics_summary(
         Raw block metrics from flow execution.
     flow_name : str
         Name of the flow for display title.
-    final_dataset : Optional[Dataset], optional
+    final_dataset : Optional[pd.DataFrame], optional
         Final dataset from flow execution. None if flow failed.
     """
     if not block_metrics:
@@ -146,8 +147,10 @@ def display_metrics_summary(
     # Add summary row
     table.add_section()
-    final_row_count = len(final_dataset) if final_dataset else 0
-    final_col_count = len(final_dataset.column_names) if final_dataset else 0
+    final_row_count = len(final_dataset) if final_dataset is not None else 0
+    final_col_count = (
+        len(final_dataset.columns.tolist()) if final_dataset is not None else 0
+    )
     table.add_row(
         "[bold]TOTAL[/bold]",

{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.5.0
+Version: 0.6.0
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0
@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
 Requires-Dist: rich
+Requires-Dist: pandas
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
 Requires-Dist: tenacity!=8.4.0,>=8.3.0
 Requires-Dist: tqdm<5.0.0,>=4.66.2
-Provides-Extra: vllm
-Requires-Dist: vllm>=0.9.1; extra == "vllm"
-Requires-Dist: torch>=2.0.0; extra == "vllm"
-Requires-Dist: transformers>=4.37.0; extra == "vllm"
-Requires-Dist: accelerate>=0.21.0; extra == "vllm"
-Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
 Provides-Extra: examples
 Requires-Dist: tabulate>=0.9.0; extra == "examples"
 Requires-Dist: transformers>=4.37.0; extra == "examples"
 Requires-Dist: langchain-text-splitters; extra == "examples"
 Requires-Dist: docling>=2.3.0; extra == "examples"
 Requires-Dist: scikit-learn; extra == "examples"
-Requires-Dist: pandas; extra == "examples"
 Requires-Dist: polars; extra == "examples"
 Requires-Dist: matplotlib; extra == "examples"
 Requires-Dist: spacy; extra == "examples"

{sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/RECORD RENAMED Viewed

@@ -1,39 +1,38 @@
 sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
-sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
+sdg_hub/_version.py,sha256=MAYWefOLb6kbIRub18WSzK6ggSjz1LNLy9aDRlX9Ea4,704
 sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
 sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
-sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
+sdg_hub/core/blocks/base.py,sha256=EpHvqXySIdx0f672c-csGKKs7N57ablC8pad_SiB1s8,13066
 sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
 sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
-sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
-sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
+sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=tHNykB-Q_ItbjDzvlpnjt0Z46mR67O6ZY29ed2ecOwo,6493
+sdg_hub/core/blocks/llm/__init__.py,sha256=1Oo2nv2uXJ2AzRlrQcqDi7gW1FNh9Fid84L89dvy4qM,683
 sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
-sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=MHhI2x9i6LrfDXgvAy2_6YxgyoD7j6BpCgNGsM69xDg,22194
-sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=DW4b09IqXmcshvXawFheDyaLp3rz7vpO5VBrKdUQYW8,31703
-sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=aoHqsDDhaIgCDfPpv7acc0DVN-zUgzFflRVB4win0aM,12012
-sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
-sdg_hub/core/blocks/llm/text_parser_block.py,sha256=975HK6NfXiU9Any4UDMpBNidRpyhHmc76BXUN69SVyc,12566
+sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=ckkjF_r9CxoX2sJiikFWFxNrAS4w_gMnedo70TrQo3Y,22730
+sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=NFk8xXceK_F1Pzn9dFNX65ynavuoQiH2ltDLLY_6SXQ,12136
+sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=zI8DFz34abGnH2Mk0KQe4Mkkb5ophwV7brn4axNsZ2I,14146
+sdg_hub/core/blocks/llm/text_parser_block.py,sha256=CoyfgKcJL9JpokzMcKk4bYeEBr6xnN0XYk45hJANnBQ,12763
 sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
-sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
-sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
-sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
-sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
-sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
-sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
-sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
+sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=dYTxgkWq6X2B37pemJdmAVi56A29NF25YTwUUyN9xHs,2837
+sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=W9ezZNgLUGbLk2U1UJCi2KFbSRPM0Q4vHnP5HGlhsoQ,8908
+sdg_hub/core/blocks/transform/json_structure_block.py,sha256=w7Ex2F3gvpG7uUnM2JM1a7D5xUKGE6HRKwyJpnfLPzc,5069
+sdg_hub/core/blocks/transform/melt_columns.py,sha256=zH3d3C0EO2DVRZqmhyr_g51xz1ZmuBRinrngUCiZkrM,4383
+sdg_hub/core/blocks/transform/rename_columns.py,sha256=EafchUDXvfXxqwRvNIcy92I1Zy6U8lsibtSqWaYdMPU,3150
+sdg_hub/core/blocks/transform/text_concat.py,sha256=Oo6VKGdmeiUmH3B0PDL1y_ot-bYmkT2jbGj7g7C84gg,3089
+sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=Osbz-jciBx5jFfzUbtbCBh_ET4CySG2h0IGWChESHi4,3239
 sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
-sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
-sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
+sdg_hub/core/flow/base.py,sha256=9nCXrCdKMzMAoIpiv2Zo7RzZhiLluXJ9XQAtg3wh_40,59104
+sdg_hub/core/flow/checkpointer.py,sha256=MJay3Q5cfRgJDetk82DaMKJ3ZZUYRHxQabEQTxhGukk,11850
 sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
 sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
-sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
+sdg_hub/core/flow/validation.py,sha256=6hs16DnusUYPo6vD_7DcgzRP5JOHDf2wPvgqvBn6hB0,9727
 sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
-sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
+sdg_hub/core/utils/datautils.py,sha256=7YzG_IpMHj04zHl-r7mswOd3IzTQKJJdfmMBgm7VXWM,4082
 sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
 sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
 sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
-sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
+sdg_hub/core/utils/flow_metrics.py,sha256=84ihZHOwbxhqPTdnUXclytf5Tva-IoA1oKIruIXv0Eo,12650
 sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
 sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
 sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
@@ -77,8 +76,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
 sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
 sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
 sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
-sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
-sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
-sdg_hub-0.5.0.dist-info/RECORD,,
+sdg_hub-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.6.0.dist-info/METADATA,sha256=euJInCQlprp43574c5bg11C_GHCu4nhivfB3vYIRC-c,9485
+sdg_hub-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.6.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.6.0.dist-info/RECORD,,

sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl