sdg-hub 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.0'
32
- __version_tuple__ = version_tuple = (0, 3, 0)
31
+ __version__ = version = '0.3.1'
32
+ __version_tuple__ = version_tuple = (0, 3, 1)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -214,8 +214,33 @@ class LLMClientManager:
214
214
  messages_list = messages
215
215
 
216
216
  if max_concurrency is not None:
217
+ if max_concurrency < 1:
218
+ raise ValueError(
219
+ "max_concurrency must be greater than 0, got {max_concurrency}"
220
+ )
221
+ # Adjust concurrency based on n parameter to avoid overwhelming API
222
+ # when n > 1 (multiple completions per request)
223
+ n_value = overrides.get("n") or self.config.n or 1
224
+ if n_value > 1:
225
+ # Warn if max_concurrency is less than n
226
+ if max_concurrency < n_value:
227
+ logger.warning(
228
+ f"max_concurrency ({max_concurrency}) is less than n ({n_value}). "
229
+ f"This may result in very low concurrency. Consider increasing max_concurrency "
230
+ f"or reducing n for better performance."
231
+ )
232
+
233
+ # Reduce concurrency when generating multiple completions per request
234
+ adjusted_concurrency = max(1, max_concurrency // n_value)
235
+ logger.debug(
236
+ f"Adjusted max_concurrency from {max_concurrency} to {adjusted_concurrency} "
237
+ f"for n={n_value} completions per request"
238
+ )
239
+ else:
240
+ adjusted_concurrency = max_concurrency
241
+
217
242
  # Use semaphore for concurrency control
218
- semaphore = asyncio.Semaphore(max_concurrency)
243
+ semaphore = asyncio.Semaphore(adjusted_concurrency)
219
244
 
220
245
  async def _create_with_semaphore(msgs):
221
246
  async with semaphore:
@@ -1,5 +1,6 @@
1
1
  # Third Party
2
2
  from datasets import Dataset, concatenate_datasets
3
+ import numpy as np
3
4
 
4
5
  # Local
5
6
  from .error_handling import FlowValidationError
@@ -39,28 +40,45 @@ def validate_no_duplicates(dataset: Dataset) -> None:
39
40
 
40
41
  df = dataset.to_pandas()
41
42
 
42
- # Try pandas duplicated() first - only convert types if we hit unhashable error
43
- try:
44
- duplicate_count = int(df.duplicated(keep="first").sum())
45
- except TypeError as e:
46
- if "unhashable type" in str(e):
47
- # Convert unhashable types to tuples so pandas can hash them
48
- for col in df.columns:
49
- if df[col].dtype == "object": # Only check object columns
50
- df[col] = df[col].apply(
51
- lambda x: (
52
- tuple(sorted(x.items()))
53
- if isinstance(x, dict)
54
- else tuple(x)
55
- if hasattr(x, "__iter__")
56
- and not isinstance(x, (str, bytes))
57
- else x
58
- )
59
- )
60
- duplicate_count = int(df.duplicated(keep="first").sum())
61
- else:
62
- raise # Re-raise if it's a different TypeError
63
-
43
+ def is_hashable(x):
44
+ try:
45
+ hash(x)
46
+ return True
47
+ except TypeError:
48
+ return False
49
+
50
+ def make_hashable(x):
51
+ if is_hashable(x):
52
+ # int, float, str, bytes, None etc. are already hashable
53
+ return x
54
+ if isinstance(x, np.ndarray):
55
+ if x.ndim == 0:
56
+ return make_hashable(x.item())
57
+ return tuple(make_hashable(i) for i in x)
58
+ if isinstance(x, dict):
59
+ # sort robustly even with heterogeneous key types
60
+ return tuple(
61
+ sorted(
62
+ ((k, make_hashable(v)) for k, v in x.items()),
63
+ key=lambda kv: repr(kv[0]),
64
+ )
65
+ )
66
+ if isinstance(x, (set, frozenset)):
67
+ # order‑insensitive
68
+ return frozenset(make_hashable(i) for i in x)
69
+ if hasattr(x, "__iter__"):
70
+ # lists, tuples, custom iterables
71
+ return tuple(make_hashable(i) for i in x)
72
+ # last‑resort fallback to a stable representation
73
+ return repr(x)
74
+
75
+ # Apply to the whole dataframe to ensure every cell is hashable
76
+ if hasattr(df, "map"):
77
+ df = df.map(make_hashable)
78
+ else:
79
+ df = df.applymap(make_hashable)
80
+
81
+ duplicate_count = int(df.duplicated(keep="first").sum())
64
82
  if duplicate_count > 0:
65
83
  raise FlowValidationError(
66
84
  f"Input dataset contains {duplicate_count} duplicate rows. "
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdg_hub
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Synthetic Data Generation
5
5
  Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
6
  License: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  sdg_hub/__init__.py,sha256=Tw-6R5a8_W1kJcTAsW3R9ltBDP1dy5-fe7Tvt3cSyCQ,550
2
- sdg_hub/_version.py,sha256=5zTqm8rgXsWYBpB2M3Zw_K1D-aV8wP7NsBLrmMKkrAQ,704
2
+ sdg_hub/_version.py,sha256=gGLpQUQx-ty9SEy9PYw9OgJWWzJLBnCpfJOfzL7SjlI,704
3
3
  sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sdg_hub/core/__init__.py,sha256=NwqB4fwhC29W50VW7QXZssLxx122YvgO9LHDLdgAnrI,496
5
5
  sdg_hub/core/blocks/__init__.py,sha256=9sCkCvDQzJGSedaePVlEIpbNwrkBz_K500VW_6FLhuE,1601
@@ -22,7 +22,7 @@ sdg_hub/core/blocks/evaluation/verify_question_block.py,sha256=LKoIHdxUuTVO24n_M
22
22
  sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
23
23
  sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=H8Gif0q9Wc_d1TnVow8Zpsg7blJOFGN1EZmV6OPpkcg,5971
24
24
  sdg_hub/core/blocks/llm/__init__.py,sha256=N6-Prgd4X85oWbMQzhYMrq7OX-NTJm57cghowK-val0,844
25
- sdg_hub/core/blocks/llm/client_manager.py,sha256=IpMUwECL9_oNFC3yxg9A6BRqMcdg0Wdpzx28BhX45Xo,14742
25
+ sdg_hub/core/blocks/llm/client_manager.py,sha256=6RNqYvFIh4SF6jopI6tTY5MA01y8Qo-tAhsE0GeHZZ0,16109
26
26
  sdg_hub/core/blocks/llm/config.py,sha256=gc4xp5D20MSlKMFEos0QAaKUwgbZpBtMGXmn6LsIk78,11289
27
27
  sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
28
28
  sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=9ytjxjADM0FydkLapZPSQPfzjjrFIdFONs3EJEoKnaw,23007
@@ -45,7 +45,7 @@ sdg_hub/core/flow/migration.py,sha256=6and-RBqV0t2gRipr1GiOOVnyBJdtyyjw1kO08Z--d
45
45
  sdg_hub/core/flow/registry.py,sha256=DzCqEEgwhvwnCBAGLogoMVdwXh4pCHrxOWqoxam7O8I,12162
46
46
  sdg_hub/core/flow/validation.py,sha256=pUJvgaUjLpKNwvW6djcqVOF-HShOjegEmGOnUnoX4BA,9722
47
47
  sdg_hub/core/utils/__init__.py,sha256=C2FzLn3dHprwGJDEgI4fyFS3aoCJR-9PhHsunxropJ8,351
48
- sdg_hub/core/utils/datautils.py,sha256=q94NzBEtNwRFhzpk3FHofgJJU0gVRgAV3AAWZ1MroFk,3860
48
+ sdg_hub/core/utils/datautils.py,sha256=__HkUe1DxcJVHKrFX68z_hDXwxJygBlJDfjJLnj7rHc,4230
49
49
  sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
50
50
  sdg_hub/core/utils/flow_id_words.yaml,sha256=5QHpQdP7zwahRuooyAlJIwBY7WcDR7vtbJXxVJqujbg,2317
51
51
  sdg_hub/core/utils/flow_identifier.py,sha256=aAHfK_G9AwEtMglLRMdMpi_AI1dciub5UqBGm4yb2HE,2841
@@ -83,8 +83,8 @@ sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml,sha256=Q_S
83
83
  sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml,sha256=_nPPMdHnxag_lYbhYUjGJGo-CvRwWvwdGX7cQhdZ1S0,847
84
84
  sdg_hub/flows/text_analysis/structured_insights/flow.yaml,sha256=2HuGTyNwYe6a8Ev-QdKZXwe29NL4wOkq4ecEV9a7NDg,4221
85
85
  sdg_hub/flows/text_analysis/structured_insights/summarize.yaml,sha256=WXwQak1pF8e1OwnOoI1EHu8QB6iUNW89rfkTdi1Oq54,687
86
- sdg_hub-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
87
- sdg_hub-0.3.0.dist-info/METADATA,sha256=eVLM1fK2-9uD_eWhSRW5VTbdUs-XIn_Va3Z-rY31Utk,9735
88
- sdg_hub-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
- sdg_hub-0.3.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
90
- sdg_hub-0.3.0.dist-info/RECORD,,
86
+ sdg_hub-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
87
+ sdg_hub-0.3.1.dist-info/METADATA,sha256=-dPDzTaPfnMb_n6p7Jcvkqv3Y-Ihi76psItQL7DQBX8,9735
88
+ sdg_hub-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
89
+ sdg_hub-0.3.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
90
+ sdg_hub-0.3.1.dist-info/RECORD,,