PyPI - sdg-hub - Versions diffs - 0.3.0__tar.gz → 0.3.1__tar.gz - Mend

sdg-hub 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (212) hide show

{sdg_hub-0.3.0/src/sdg_hub.egg-info → sdg_hub-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.3.0
+Version: 0.3.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0

{sdg_hub-0.3.0 → sdg_hub-0.3.1}/src/sdg_hub/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.3.0'
-__version_tuple__ = version_tuple = (0, 3, 0)
+__version__ = version = '0.3.1'
+__version_tuple__ = version_tuple = (0, 3, 1)
-__commit_id__ = commit_id = 'g5b81eba8c'
+__commit_id__ = commit_id = 'g4e0f10375'

{sdg_hub-0.3.0 → sdg_hub-0.3.1}/src/sdg_hub/core/blocks/llm/client_manager.py RENAMED Viewed

@@ -214,8 +214,33 @@ class LLMClientManager:
             messages_list = messages
             if max_concurrency is not None:
+                if max_concurrency < 1:
+                    raise ValueError(
+                        "max_concurrency must be greater than 0, got {max_concurrency}"
+                    )
+                # Adjust concurrency based on n parameter to avoid overwhelming API
+                # when n > 1 (multiple completions per request)
+                n_value = overrides.get("n") or self.config.n or 1
+                if n_value > 1:
+                    # Warn if max_concurrency is less than n
+                    if max_concurrency < n_value:
+                        logger.warning(
+                            f"max_concurrency ({max_concurrency}) is less than n ({n_value}). "
+                            f"This may result in very low concurrency. Consider increasing max_concurrency "
+                            f"or reducing n for better performance."
+                        )
+                    # Reduce concurrency when generating multiple completions per request
+                    adjusted_concurrency = max(1, max_concurrency // n_value)
+                    logger.debug(
+                        f"Adjusted max_concurrency from {max_concurrency} to {adjusted_concurrency} "
+                        f"for n={n_value} completions per request"
+                    )
+                else:
+                    adjusted_concurrency = max_concurrency
                 # Use semaphore for concurrency control
-                semaphore = asyncio.Semaphore(max_concurrency)
+                semaphore = asyncio.Semaphore(adjusted_concurrency)
                 async def _create_with_semaphore(msgs):
                     async with semaphore:

{sdg_hub-0.3.0 → sdg_hub-0.3.1}/src/sdg_hub/core/utils/datautils.py RENAMED Viewed

@@ -1,5 +1,6 @@
 # Third Party
 from datasets import Dataset, concatenate_datasets
+import numpy as np
 # Local
 from .error_handling import FlowValidationError
@@ -39,28 +40,45 @@ def validate_no_duplicates(dataset: Dataset) -> None:
     df = dataset.to_pandas()
-    # Try pandas duplicated() first - only convert types if we hit unhashable error
-    try:
-        duplicate_count = int(df.duplicated(keep="first").sum())
-    except TypeError as e:
-        if "unhashable type" in str(e):
-            # Convert unhashable types to tuples so pandas can hash them
-            for col in df.columns:
-                if df[col].dtype == "object":  # Only check object columns
-                    df[col] = df[col].apply(
-                        lambda x: (
-                            tuple(sorted(x.items()))
-                            if isinstance(x, dict)
-                            else tuple(x)
-                            if hasattr(x, "__iter__")
-                            and not isinstance(x, (str, bytes))
-                            else x
-                        )
-                    )
-            duplicate_count = int(df.duplicated(keep="first").sum())
-        else:
-            raise  # Re-raise if it's a different TypeError
+    def is_hashable(x):
+        try:
+            hash(x)
+            return True
+        except TypeError:
+            return False
+    def make_hashable(x):
+        if is_hashable(x):
+            # int, float, str, bytes, None etc. are already hashable
+            return x
+        if isinstance(x, np.ndarray):
+            if x.ndim == 0:
+                return make_hashable(x.item())
+            return tuple(make_hashable(i) for i in x)
+        if isinstance(x, dict):
+            # sort robustly even with heterogeneous key types
+            return tuple(
+                sorted(
+                    ((k, make_hashable(v)) for k, v in x.items()),
+                    key=lambda kv: repr(kv[0]),
+                )
+            )
+        if isinstance(x, (set, frozenset)):
+            # order‑insensitive
+            return frozenset(make_hashable(i) for i in x)
+        if hasattr(x, "__iter__"):
+            # lists, tuples, custom iterables
+            return tuple(make_hashable(i) for i in x)
+        # last‑resort fallback to a stable representation
+        return repr(x)
+    # Apply to the whole dataframe to ensure every cell is hashable
+    if hasattr(df, "map"):
+        df = df.map(make_hashable)
+    else:
+        df = df.applymap(make_hashable)
+    duplicate_count = int(df.duplicated(keep="first").sum())
     if duplicate_count > 0:
         raise FlowValidationError(
             f"Input dataset contains {duplicate_count} duplicate rows. "

{sdg_hub-0.3.0 → sdg_hub-0.3.1/src/sdg_hub.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.3.0
+Version: 0.3.1
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0

{sdg_hub-0.3.0 → sdg_hub-0.3.1}/tests/blocks/llm/test_llm_chat_block.py RENAMED Viewed

@@ -488,6 +488,37 @@ class TestLLMChatBlock:
 class TestErrorHandling:
     """Test error handling for LLMChatBlock."""
+    def test_max_concurrency_value_error(
+        self, mock_litellm_acompletion, sample_dataset
+    ):
+        """Test ValueError is raised when max_concurrency < 1."""
+        block = LLMChatBlock(
+            block_name="test_max_concurrency_error",
+            input_cols="messages",
+            output_cols="response",
+            model="openai/gpt-4",
+            api_key="test-key",
+            async_mode=True,
+        )
+        # Test with max_concurrency = 0
+        with pytest.raises(
+            ValueError, match="max_concurrency must be greater than 0, got"
+        ):
+            block.generate(sample_dataset, _flow_max_concurrency=0)
+        # Test with max_concurrency = -1
+        with pytest.raises(
+            ValueError, match="max_concurrency must be greater than 0, got"
+        ):
+            block.generate(sample_dataset, _flow_max_concurrency=-1)
+        # Test with max_concurrency = -5
+        with pytest.raises(
+            ValueError, match="max_concurrency must be greater than 0, got"
+        ):
+            block.generate(sample_dataset, _flow_max_concurrency=-5)
     def test_litellm_rate_limit_error(self, sample_dataset):
         """Test handling of LiteLLM rate limit errors."""
         with patch(
@@ -660,6 +691,131 @@ class TestMultipleResponses:
         assert mock_litellm_completion_multiple.call_count == 2  # One call per sample
+    def test_concurrency_adjustment_with_n_greater_than_1(
+        self, mock_litellm_acompletion, sample_dataset
+    ):
+        """Test concurrency is adjusted when n > 1 to avoid overwhelming API."""
+        with patch("sdg_hub.core.blocks.llm.client_manager.logger") as mock_logger:
+            block = LLMChatBlock(
+                block_name="test_concurrency_adjustment",
+                input_cols="messages",
+                output_cols="responses",
+                model="openai/gpt-4",
+                api_key="test-key",
+                n=4,  # Generate 4 responses per input
+                async_mode=True,
+            )
+            # Test with max_concurrency = 8, should be adjusted to 2 (8 // 4)
+            result = block.generate(sample_dataset, _flow_max_concurrency=8)
+            assert "responses" in result.column_names
+            assert len(result["responses"]) == 2
+            # Verify debug log was called for concurrency adjustment
+            mock_logger.debug.assert_called()
+            debug_calls = [
+                call
+                for call in mock_logger.debug.call_args_list
+                if "Adjusted max_concurrency" in str(call)
+            ]
+            assert len(debug_calls) > 0
+            assert "Adjusted max_concurrency from 8 to 2" in str(debug_calls[0])
+            assert "for n=4 completions per request" in str(debug_calls[0])
+    def test_concurrency_warning_when_max_concurrency_less_than_n(
+        self, mock_litellm_acompletion, sample_dataset
+    ):
+        """Test warning is logged when max_concurrency < n."""
+        with patch("sdg_hub.core.blocks.llm.client_manager.logger") as mock_logger:
+            block = LLMChatBlock(
+                block_name="test_concurrency_warning",
+                input_cols="messages",
+                output_cols="responses",
+                model="openai/gpt-4",
+                api_key="test-key",
+                n=5,  # Generate 5 responses per input
+                async_mode=True,
+            )
+            # Test with max_concurrency = 3, which is less than n=5
+            result = block.generate(sample_dataset, _flow_max_concurrency=3)
+            assert "responses" in result.column_names
+            assert len(result["responses"]) == 2
+            # Verify warning log was called
+            mock_logger.warning.assert_called()
+            warning_calls = [
+                call
+                for call in mock_logger.warning.call_args_list
+                if "max_concurrency" in str(call)
+            ]
+            assert len(warning_calls) > 0
+            assert "max_concurrency (3) is less than n (5)" in str(warning_calls[0])
+            assert "Consider increasing max_concurrency" in str(warning_calls[0])
+    def test_concurrency_not_adjusted_when_n_is_1(
+        self, mock_litellm_acompletion, sample_dataset
+    ):
+        """Test concurrency is not adjusted when n=1 or n=None."""
+        with patch("sdg_hub.core.blocks.llm.client_manager.logger") as mock_logger:
+            # Test with n=1
+            block_n1 = LLMChatBlock(
+                block_name="test_no_adjustment_n1",
+                input_cols="messages",
+                output_cols="response",
+                model="openai/gpt-4",
+                api_key="test-key",
+                n=1,
+                async_mode=True,
+            )
+            result = block_n1.generate(sample_dataset, _flow_max_concurrency=8)
+            assert "response" in result.column_names
+            assert len(result["response"]) == 2
+            # No adjustment should happen, so no debug log about adjustment
+            debug_calls = [
+                call
+                for call in mock_logger.debug.call_args_list
+                if "Adjusted max_concurrency" in str(call)
+            ]
+            assert len(debug_calls) == 0
+    def test_concurrency_override_in_generate_call(
+        self, mock_litellm_acompletion, sample_dataset
+    ):
+        """Test concurrency adjustment works when n is overridden in generate call."""
+        with patch("sdg_hub.core.blocks.llm.client_manager.logger") as mock_logger:
+            block = LLMChatBlock(
+                block_name="test_override_adjustment",
+                input_cols="messages",
+                output_cols="responses",
+                model="openai/gpt-4",
+                api_key="test-key",
+                n=1,  # Initially set to 1
+                async_mode=True,
+            )
+            # Override n to 3 at runtime with max_concurrency=9
+            result = block.generate(sample_dataset, n=3, _flow_max_concurrency=9)
+            assert "responses" in result.column_names
+            assert len(result["responses"]) == 2
+            # Verify debug log shows adjustment based on runtime n=3
+            mock_logger.debug.assert_called()
+            debug_calls = [
+                call
+                for call in mock_logger.debug.call_args_list
+                if "Adjusted max_concurrency" in str(call)
+            ]
+            assert len(debug_calls) > 0
+            assert "Adjusted max_concurrency from 9 to 3" in str(debug_calls[0])
+            assert "for n=3 completions per request" in str(debug_calls[0])
     def test_single_response_still_works(self, mock_litellm_completion, sample_dataset):
         """Test that n=1 or n=None still returns single strings."""
         # Test n=1

sdg-hub 0.3.0__tar.gz → 0.3.1__tar.gz

sdg-hub 0.3.0__tar.gz → 0.3.1__tar.gz