sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,33 +1,67 @@
1
1
  # Third Party
2
- from datasets import Dataset, concatenate_datasets
3
2
  import numpy as np
3
+ import pandas as pd
4
4
 
5
5
  # Local
6
6
  from .error_handling import FlowValidationError
7
7
 
8
8
 
9
+ def _is_hashable(x):
10
+ """Check if a value is hashable."""
11
+ try:
12
+ hash(x)
13
+ return True
14
+ except TypeError:
15
+ return False
16
+
17
+
18
+ def _make_hashable(x):
19
+ """Convert any value to a hashable representation for duplicate detection.
20
+
21
+ Handles numpy arrays, dicts, sets, lists, and other complex types by
22
+ converting them to hashable equivalents (tuples, frozensets, etc.).
23
+ """
24
+ if _is_hashable(x):
25
+ return x
26
+ if isinstance(x, np.ndarray):
27
+ if x.ndim == 0:
28
+ return _make_hashable(x.item())
29
+ return tuple(_make_hashable(i) for i in x)
30
+ if isinstance(x, dict):
31
+ return tuple(
32
+ sorted(
33
+ ((k, _make_hashable(v)) for k, v in x.items()),
34
+ key=lambda kv: repr(kv[0]),
35
+ )
36
+ )
37
+ if isinstance(x, (set, frozenset)):
38
+ return frozenset(_make_hashable(i) for i in x)
39
+ if hasattr(x, "__iter__"):
40
+ return tuple(_make_hashable(i) for i in x)
41
+ return repr(x)
42
+
43
+
9
44
  def safe_concatenate_datasets(datasets: list):
10
45
  """Concatenate datasets safely, ignoring any datasets that are None or empty."""
11
- filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
46
+ filtered_datasets = [ds for ds in datasets if ds is not None and len(ds) > 0]
12
47
 
13
48
  if not filtered_datasets:
14
49
  return None
15
50
 
16
- return concatenate_datasets(filtered_datasets)
51
+ return pd.concat(filtered_datasets, ignore_index=True)
17
52
 
18
53
 
19
- def validate_no_duplicates(dataset: Dataset) -> None:
54
+ def validate_no_duplicates(dataset: pd.DataFrame) -> None:
20
55
  """
21
56
  Validate that the input dataset contains only unique rows.
22
57
 
23
58
  Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
24
- to handle numpy arrays that cause TypeError in pandas duplicate detection.
25
- Raises FlowValidationError if duplicates are found, including a count
26
- of the duplicate rows detected.
59
+ to handle numpy arrays and other unhashable types that cause TypeError in pandas
60
+ duplicate detection.
27
61
 
28
62
  Parameters
29
63
  ----------
30
- dataset : Dataset
64
+ dataset : pd.DataFrame
31
65
  Input dataset to validate.
32
66
 
33
67
  Raises
@@ -38,47 +72,11 @@ def validate_no_duplicates(dataset: Dataset) -> None:
38
72
  if len(dataset) == 0:
39
73
  return
40
74
 
41
- df = dataset.to_pandas()
42
-
43
- def is_hashable(x):
44
- try:
45
- hash(x)
46
- return True
47
- except TypeError:
48
- return False
49
-
50
- def make_hashable(x):
51
- if is_hashable(x):
52
- # int, float, str, bytes, None etc. are already hashable
53
- return x
54
- if isinstance(x, np.ndarray):
55
- if x.ndim == 0:
56
- return make_hashable(x.item())
57
- return tuple(make_hashable(i) for i in x)
58
- if isinstance(x, dict):
59
- # sort robustly even with heterogeneous key types
60
- return tuple(
61
- sorted(
62
- ((k, make_hashable(v)) for k, v in x.items()),
63
- key=lambda kv: repr(kv[0]),
64
- )
65
- )
66
- if isinstance(x, (set, frozenset)):
67
- # order‑insensitive
68
- return frozenset(make_hashable(i) for i in x)
69
- if hasattr(x, "__iter__"):
70
- # lists, tuples, custom iterables
71
- return tuple(make_hashable(i) for i in x)
72
- # last‑resort fallback to a stable representation
73
- return repr(x)
74
-
75
- # Apply to the whole dataframe to ensure every cell is hashable
76
- if hasattr(df, "map"):
77
- df = df.map(make_hashable)
78
- else:
79
- df = df.applymap(make_hashable)
80
-
81
- duplicate_count = int(df.duplicated(keep="first").sum())
75
+ # Transform all cells to hashable representations for duplicate detection
76
+ # This creates a temporary copy but is necessary for reliable duplicate detection
77
+ hashable_df = dataset.map(_make_hashable)
78
+
79
+ duplicate_count = int(hashable_df.duplicated(keep="first").sum())
82
80
  if duplicate_count > 0:
83
81
  raise FlowValidationError(
84
82
  f"Input dataset contains {duplicate_count} duplicate rows. "
@@ -89,19 +87,19 @@ def validate_no_duplicates(dataset: Dataset) -> None:
89
87
 
90
88
  def safe_concatenate_with_validation(
91
89
  datasets: list, context: str = "datasets"
92
- ) -> Dataset:
90
+ ) -> pd.DataFrame:
93
91
  """Safely concatenate datasets with schema validation and clear error messages.
94
92
 
95
93
  Parameters
96
94
  ----------
97
- datasets : list[Dataset]
95
+ datasets : list[pd.DataFrame]
98
96
  List of datasets to concatenate
99
97
  context : str
100
98
  Description of what's being concatenated for error messages
101
99
 
102
100
  Returns
103
101
  -------
104
- Dataset
102
+ pd.DataFrame
105
103
  Concatenated dataset
106
104
 
107
105
  Raises
@@ -119,12 +117,12 @@ def safe_concatenate_with_validation(
119
117
  return valid_datasets[0]
120
118
 
121
119
  try:
122
- return concatenate_datasets(valid_datasets)
120
+ return pd.concat(valid_datasets, ignore_index=True)
123
121
  except Exception as e:
124
122
  # Schema mismatch or other concatenation error
125
123
  schema_info = []
126
124
  for i, ds in enumerate(valid_datasets):
127
- schema_info.append(f"Dataset {i}: columns={ds.column_names}")
125
+ schema_info.append(f"Dataset {i}: columns={ds.columns.tolist()}")
128
126
 
129
127
  schema_details = "\n".join(schema_info)
130
128
  raise FlowValidationError(
@@ -8,12 +8,13 @@ from typing import Any, Optional
8
8
  import json
9
9
  import time
10
10
 
11
- # Third Party
12
- from datasets import Dataset
13
11
  from rich.console import Console
14
12
  from rich.panel import Panel
15
13
  from rich.table import Table
16
14
 
15
+ # Third Party
16
+ import pandas as pd
17
+
17
18
 
18
19
  def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
19
20
  """Aggregate per-block metrics, coalescing chunked runs.
@@ -71,7 +72,7 @@ def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any
71
72
  def display_metrics_summary(
72
73
  block_metrics: list[dict[str, Any]],
73
74
  flow_name: str,
74
- final_dataset: Optional[Dataset] = None,
75
+ final_dataset: Optional[pd.DataFrame] = None,
75
76
  ) -> None:
76
77
  """Display a rich table summarizing block execution metrics.
77
78
 
@@ -81,7 +82,7 @@ def display_metrics_summary(
81
82
  Raw block metrics from flow execution.
82
83
  flow_name : str
83
84
  Name of the flow for display title.
84
- final_dataset : Optional[Dataset], optional
85
+ final_dataset : Optional[pd.DataFrame], optional
85
86
  Final dataset from flow execution. None if flow failed.
86
87
  """
87
88
  if not block_metrics:
@@ -146,8 +147,10 @@ def display_metrics_summary(
146
147
 
147
148
  # Add summary row
148
149
  table.add_section()
149
- final_row_count = len(final_dataset) if final_dataset else 0
150
- final_col_count = len(final_dataset.column_names) if final_dataset else 0
150
+ final_row_count = len(final_dataset) if final_dataset is not None else 0
151
+ final_col_count = (
152
+ len(final_dataset.columns.tolist()) if final_dataset is not None else 0
153
+ )
151
154
 
152
155
  table.add_row(
153
156
  "[bold]TOTAL[/bold]",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -28,23 +28,17 @@ Requires-Dist: httpx<1.0.0,>=0.25.0
28
28
  Requires-Dist: jinja2
29
29
  Requires-Dist: litellm<1.75.0,>=1.73.0
30
30
  Requires-Dist: rich
31
+ Requires-Dist: pandas
31
32
  Requires-Dist: pydantic<3.0.0,>=2.0.0
32
33
  Requires-Dist: python-dotenv<2.0.0,>=1.0.0
33
34
  Requires-Dist: tenacity!=8.4.0,>=8.3.0
34
35
  Requires-Dist: tqdm<5.0.0,>=4.66.2
35
- Provides-Extra: vllm
36
- Requires-Dist: vllm>=0.9.1; extra == "vllm"
37
- Requires-Dist: torch>=2.0.0; extra == "vllm"
38
- Requires-Dist: transformers>=4.37.0; extra == "vllm"
39
- Requires-Dist: accelerate>=0.21.0; extra == "vllm"
40
- Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
41
36
  Provides-Extra: examples
42
37
  Requires-Dist: tabulate>=0.9.0; extra == "examples"
43
38
  Requires-Dist: transformers>=4.37.0; extra == "examples"
44
39
  Requires-Dist: langchain-text-splitters; extra == "examples"
45
40
  Requires-Dist: docling>=2.3.0; extra == "examples"
46
41
  Requires-Dist: scikit-learn; extra == "examples"
47
- Requires-Dist: pandas; extra == "examples"
48
42
  Requires-Dist: polars; extra == "examples"
49
43
  Requires-Dist: matplotlib; extra == "examples"
50
44
  Requires-Dist: spacy; extra == "examples"
@@ -1,39 +1,38 @@
1
1
  sdg_hub/__init__.py,sha256=TlkZT40-70urdcWLqv3kupaJj8s-SVgd2QyvlSFwb4A,510
2
- sdg_hub/_version.py,sha256=fvHpBU3KZKRinkriKdtAt3crenOyysELF-M9y3ozg3U,704
2
+ sdg_hub/_version.py,sha256=MAYWefOLb6kbIRub18WSzK6ggSjz1LNLy9aDRlX9Ea4,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=e3BoejbqjYhasf9t__L4qE52lkD9EBjx4o--2kqKdro,460
5
5
  sdg_hub/core/blocks/__init__.py,sha256=8Rn1SglH8V3jGmTD_cG-h7qk9ktAab2eaBdyk7RN_hY,865
6
- sdg_hub/core/blocks/base.py,sha256=-SOdBpJwtRTMsrmCEuLjUBQMRCo_PLYlHEBRrz8sF9g,13031
6
+ sdg_hub/core/blocks/base.py,sha256=EpHvqXySIdx0f672c-csGKKs7N57ablC8pad_SiB1s8,13066
7
7
  sdg_hub/core/blocks/registry.py,sha256=FuEN_pnq-nSH1LguY3_oCubT6Kz3SuJjk3TcUpLT-lw,10695
8
8
  sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
9
- sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=2Z9j_CiiTn5mHZ9gfXU-itLXDmeXSh0UI0x1x7j-LQ0,6001
10
- sdg_hub/core/blocks/llm/__init__.py,sha256=AyS0dd3pkPPXH5a9aj4mT5HsKjX2vjXfkmQc6rkFV4A,795
9
+ sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=tHNykB-Q_ItbjDzvlpnjt0Z46mR67O6ZY29ed2ecOwo,6493
10
+ sdg_hub/core/blocks/llm/__init__.py,sha256=1Oo2nv2uXJ2AzRlrQcqDi7gW1FNh9Fid84L89dvy4qM,683
11
11
  sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
12
- sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=MHhI2x9i6LrfDXgvAy2_6YxgyoD7j6BpCgNGsM69xDg,22194
13
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py,sha256=DW4b09IqXmcshvXawFheDyaLp3rz7vpO5VBrKdUQYW8,31703
14
- sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=aoHqsDDhaIgCDfPpv7acc0DVN-zUgzFflRVB4win0aM,12012
15
- sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
16
- sdg_hub/core/blocks/llm/text_parser_block.py,sha256=975HK6NfXiU9Any4UDMpBNidRpyhHmc76BXUN69SVyc,12566
12
+ sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=ckkjF_r9CxoX2sJiikFWFxNrAS4w_gMnedo70TrQo3Y,22730
13
+ sdg_hub/core/blocks/llm/llm_parser_block.py,sha256=NFk8xXceK_F1Pzn9dFNX65ynavuoQiH2ltDLLY_6SXQ,12136
14
+ sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=zI8DFz34abGnH2Mk0KQe4Mkkb5ophwV7brn4axNsZ2I,14146
15
+ sdg_hub/core/blocks/llm/text_parser_block.py,sha256=CoyfgKcJL9JpokzMcKk4bYeEBr6xnN0XYk45hJANnBQ,12763
17
16
  sdg_hub/core/blocks/transform/__init__.py,sha256=lF9InjOzA6p_mjiwV-a2Kwstq9kqRiQ-dEwbsmR9yQs,825
18
- sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
19
- sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=XC_a7Skbd3mu7f4ra8fGWPxMwqUMSjJkQ7Ag7vflwJA,8235
20
- sdg_hub/core/blocks/transform/json_structure_block.py,sha256=hm-0M0NAyUREgJRPyV1u-laorgX6MZ1o17E9rNBhN78,5010
21
- sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
22
- sdg_hub/core/blocks/transform/rename_columns.py,sha256=W2hcDSJY6L73ZpElUhOML2sGLM9Y-v0gSo3xEF1LXDc,2749
23
- sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
24
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
17
+ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=dYTxgkWq6X2B37pemJdmAVi56A29NF25YTwUUyN9xHs,2837
18
+ sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=W9ezZNgLUGbLk2U1UJCi2KFbSRPM0Q4vHnP5HGlhsoQ,8908
19
+ sdg_hub/core/blocks/transform/json_structure_block.py,sha256=w7Ex2F3gvpG7uUnM2JM1a7D5xUKGE6HRKwyJpnfLPzc,5069
20
+ sdg_hub/core/blocks/transform/melt_columns.py,sha256=zH3d3C0EO2DVRZqmhyr_g51xz1ZmuBRinrngUCiZkrM,4383
21
+ sdg_hub/core/blocks/transform/rename_columns.py,sha256=EafchUDXvfXxqwRvNIcy92I1Zy6U8lsibtSqWaYdMPU,3150
22
+ sdg_hub/core/blocks/transform/text_concat.py,sha256=Oo6VKGdmeiUmH3B0PDL1y_ot-bYmkT2jbGj7g7C84gg,3089
23
+ sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=Osbz-jciBx5jFfzUbtbCBh_ET4CySG2h0IGWChESHi4,3239
25
24
  sdg_hub/core/flow/__init__.py,sha256=0_m_htuZfPxk8xQ9IKfp0Pz-JRE4O7lYMUFrKyLNoLA,409
26
- sdg_hub/core/flow/base.py,sha256=64YJJujNRaSIbT1YKn9nAxij_hdJ9xRVH_uiUY1IUcI,55788
27
- sdg_hub/core/flow/checkpointer.py,sha256=stm5ZtjjEiLk9ZkAAnoQQn5Y8Yl_d7qCsQLZTrCXR48,11867
25
+ sdg_hub/core/flow/base.py,sha256=9nCXrCdKMzMAoIpiv2Zo7RzZhiLluXJ9XQAtg3wh_40,59104
26
+ sdg_hub/core/flow/checkpointer.py,sha256=MJay3Q5cfRgJDetk82DaMKJ3ZZUYRHxQabEQTxhGukk,11850
28
27
  sdg_hub/core/flow/metadata.py,sha256=cFrpJjWOaK87aCuRFyC3Pdf83oYU93mrmZEMdUnhsN8,10540
29
28
  sdg_hub/core/flow/registry.py,sha256=N6KfX-L7QRkooznIFxDuhRZYuDA5g3N5zC-KRm2jVhk,12109
30
- sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
29
+ sdg_hub/core/flow/validation.py,sha256=6hs16DnusUYPo6vD_7DcgzRP5JOHDf2wPvgqvBn6hB0,9727
31
30
  sdg_hub/core/utils/__init__.py,sha256=KcT56JhobC5sBg0MKEMn5hc4OyKa9_Vnn45Mt_kS4jQ,610
32
- sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
31
+ sdg_hub/core/utils/datautils.py,sha256=7YzG_IpMHj04zHl-r7mswOd3IzTQKJJdfmMBgm7VXWM,4082
33
32
  sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
34
33
  sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
35
34
  sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
36
- sdg_hub/core/utils/flow_metrics.py,sha256=3G-xbfr-rFA578wV4KUbQePTMVGZHr9-rXvyYL4Kt2Q,12604
35
+ sdg_hub/core/utils/flow_metrics.py,sha256=84ihZHOwbxhqPTdnUXclytf5Tva-IoA1oKIruIXv0Eo,12650
37
36
  sdg_hub/core/utils/logger_config.py,sha256=6_cnsIHtSAdq1iTTZ7Q7nAJ1dmldlxSZ0AB49yLiQ20,2034
38
37
  sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
39
38
  sdg_hub/core/utils/time_estimator.py,sha256=rM3_R-Ka5DEtvOtlJoA_5pXSyQ6tT6t4h6qh3_5BCZo,12639
@@ -77,8 +76,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
77
76
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
78
77
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=BBV18SdvuVTAESjwkJ7V1jbb-cSTBvNl3SCycd0oEQ4,4934
79
78
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
80
- sdg_hub-0.5.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
81
- sdg_hub-0.5.0.dist-info/METADATA,sha256=z4tCCtWlTBzu5DF1K44RtWjIs7ZNL6__2Aae7I0EfxQ,9775
82
- sdg_hub-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
83
- sdg_hub-0.5.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
84
- sdg_hub-0.5.0.dist-info/RECORD,,
79
+ sdg_hub-0.6.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
80
+ sdg_hub-0.6.0.dist-info/METADATA,sha256=euJInCQlprp43574c5bg11C_GHCu4nhivfB3vYIRC-c,9485
81
+ sdg_hub-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
+ sdg_hub-0.6.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
83
+ sdg_hub-0.6.0.dist-info/RECORD,,