PyPI - sdg-hub - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

sdg-hub 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (187) hide show

{sdg_hub-0.2.0/src/sdg_hub.egg-info → sdg_hub-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg_hub
-Version: 0.2.0
+Version: 0.2.2
 Summary: Synthetic Data Generation
 Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
 License: Apache-2.0
@@ -27,7 +27,6 @@ Requires-Dist: datasets<4.0.0,>=2.18.0
 Requires-Dist: httpx<1.0.0,>=0.25.0
 Requires-Dist: jinja2
 Requires-Dist: litellm<1.75.0,>=1.73.0
-Requires-Dist: openai<2.0.0,>=1.13.3
 Requires-Dist: rich
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: python-dotenv<2.0.0,>=1.0.0
@@ -92,6 +91,8 @@ A modular Python framework for building synthetic data generation pipelines usin
 **📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
+**📋 Dataset Schema Discovery** - Instantly discover required data formats. Get empty datasets with correct schema for easy validation and data preparation.
 **🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
@@ -121,7 +122,7 @@ uv pip install sdg-hub[examples]
 ## 🚀 Quick Start
-### 🧱 Core Concepts
+### Core Concepts
 **Blocks** are composable units that transform datasets - think of them as data processing Lego pieces. Each block performs a specific task: LLM chat, text parsing, evaluation, or transformation.
@@ -136,7 +137,7 @@ dataset → Block₁ → Block₂ → Block₃ → enriched_dataset
 #### Flow Discovery
 ```python
-from sdg_hub import FlowRegistry
+from sdg_hub import FlowRegistry, Flow
 # Auto-discover all available flows (no setup needed!)
 FlowRegistry.discover_flows()
@@ -150,16 +151,20 @@ qa_flows = FlowRegistry.search_flows(tag="question-generation")
 print(f"QA flows: {qa_flows}")
 ```
-#### Using Flows
+Each flow has a **unique, human-readable ID** automatically generated from its name. These IDs provide a convenient shorthand for referencing flows:
 ```python
-from sdg_hub import FlowRegistry, Flow
-from datasets import Dataset
+# Every flow gets a deterministic ID
+# Same flow name always generates the same ID
+flow_id = "small-rock-799"
-# Load the flow by name
-flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
-flow_path = FlowRegistry.get_flow_path(flow_name)
+# Use ID to reference the flow
+flow_path = FlowRegistry.get_flow_path(flow_id)
 flow = Flow.from_yaml(flow_path)
+```
+#### Discovering Models and Configuring them
+```python
 # Discover recommended models
 default_model = flow.get_default_model()
 recommendations = flow.get_model_recommendations()
@@ -171,21 +176,52 @@ flow.set_model_config(
     api_base="http://localhost:8000/v1",
     api_key="your_key",
 )
-# Create your dataset with required columns
-dataset = Dataset.from_dict({
-    'document': ['Your document text here...'],
-    'document_outline': ['1. Topic A; 2. Topic B; 3. Topic C'],
-    'domain': ['Computer Science'],
-    'icl_document': ['Example document for in-context learning...'],
-    'icl_query_1': ['Example question 1?'],
-    'icl_response_1': ['Example answer 1'],
-    'icl_query_2': ['Example question 2?'],
-    'icl_response_2': ['Example answer 2'],
-    'icl_query_3': ['Example question 3?'],
-    'icl_response_3': ['Example answer 3']
+```
+#### Discover dataset requirements and create your dataset
+```python
+# First, discover what data the flow needs
+# Get an empty dataset with the exact schema needed
+schema_dataset = flow.get_dataset_schema()  # Get empty dataset with correct schema
+print(f"Required columns: {schema_dataset.column_names}")
+print(f"Schema: {schema_dataset.features}")
+# Option 1: Add data directly to the schema dataset
+dataset = schema_dataset.add_item({
+    'document': 'Your document text here...',
+    'document_outline': '1. Topic A; 2. Topic B; 3. Topic C',
+    'domain': 'Computer Science',
+    'icl_document': 'Example document for in-context learning...',
+    'icl_query_1': 'Example question 1?',
+    'icl_response_1': 'Example answer 1',
+    'icl_query_2': 'Example question 2?',
+    'icl_response_2': 'Example answer 2',
+    'icl_query_3': 'Example question 3?',
+    'icl_response_3': 'Example answer 3'
 })
+# Option 2: Create your own dataset and validate the schema
+my_dataset = Dataset.from_dict(my_data_dict)
+if my_dataset.features == schema_dataset.features:
+    print("✅ Schema matches - ready to generate!")
+    dataset = my_dataset
+else:
+    print("❌ Schema mismatch - check your columns")
+# Option 3: Get raw requirements for detailed inspection
+requirements = flow.get_dataset_requirements()
+if requirements:
+    print(f"Required: {requirements.required_columns}")
+    print(f"Optional: {requirements.optional_columns}")
+    print(f"Min samples: {requirements.min_samples}")
+```
+#### Dry Run and Generate
+```python
+# Quick Testing with Dry Run
+dry_result = flow.dry_run(dataset, sample_size=1)
+print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
+print(f"Output columns: {dry_result['final_dataset']['columns']}")
 # Generate high-quality QA pairs
 result = flow.generate(dataset)
@@ -196,14 +232,6 @@ faithfulness_scores = result['faithfulness_judgment']
 relevancy_scores = result['relevancy_score']
 ```
-#### Quick Testing with Dry Run
-```python
-# Test the flow with a small sample first
-dry_result = flow.dry_run(dataset, sample_size=1)
-print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
-print(f"Output columns: {dry_result['final_dataset']['columns']}")
-```
 ## 📄 License

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/README.md RENAMED Viewed

@@ -24,6 +24,8 @@ A modular Python framework for building synthetic data generation pipelines usin
 **📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
+**📋 Dataset Schema Discovery** - Instantly discover required data formats. Get empty datasets with correct schema for easy validation and data preparation.
 **🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
@@ -53,7 +55,7 @@ uv pip install sdg-hub[examples]
 ## 🚀 Quick Start
-### 🧱 Core Concepts
+### Core Concepts
 **Blocks** are composable units that transform datasets - think of them as data processing Lego pieces. Each block performs a specific task: LLM chat, text parsing, evaluation, or transformation.
@@ -68,7 +70,7 @@ dataset → Block₁ → Block₂ → Block₃ → enriched_dataset
 #### Flow Discovery
 ```python
-from sdg_hub import FlowRegistry
+from sdg_hub import FlowRegistry, Flow
 # Auto-discover all available flows (no setup needed!)
 FlowRegistry.discover_flows()
@@ -82,16 +84,20 @@ qa_flows = FlowRegistry.search_flows(tag="question-generation")
 print(f"QA flows: {qa_flows}")
 ```
-#### Using Flows
+Each flow has a **unique, human-readable ID** automatically generated from its name. These IDs provide a convenient shorthand for referencing flows:
 ```python
-from sdg_hub import FlowRegistry, Flow
-from datasets import Dataset
+# Every flow gets a deterministic ID
+# Same flow name always generates the same ID
+flow_id = "small-rock-799"
-# Load the flow by name
-flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
-flow_path = FlowRegistry.get_flow_path(flow_name)
+# Use ID to reference the flow
+flow_path = FlowRegistry.get_flow_path(flow_id)
 flow = Flow.from_yaml(flow_path)
+```
+#### Discovering Models and Configuring them
+```python
 # Discover recommended models
 default_model = flow.get_default_model()
 recommendations = flow.get_model_recommendations()
@@ -103,21 +109,52 @@ flow.set_model_config(
     api_base="http://localhost:8000/v1",
     api_key="your_key",
 )
-# Create your dataset with required columns
-dataset = Dataset.from_dict({
-    'document': ['Your document text here...'],
-    'document_outline': ['1. Topic A; 2. Topic B; 3. Topic C'],
-    'domain': ['Computer Science'],
-    'icl_document': ['Example document for in-context learning...'],
-    'icl_query_1': ['Example question 1?'],
-    'icl_response_1': ['Example answer 1'],
-    'icl_query_2': ['Example question 2?'],
-    'icl_response_2': ['Example answer 2'],
-    'icl_query_3': ['Example question 3?'],
-    'icl_response_3': ['Example answer 3']
+```
+#### Discover dataset requirements and create your dataset
+```python
+# First, discover what data the flow needs
+# Get an empty dataset with the exact schema needed
+schema_dataset = flow.get_dataset_schema()  # Get empty dataset with correct schema
+print(f"Required columns: {schema_dataset.column_names}")
+print(f"Schema: {schema_dataset.features}")
+# Option 1: Add data directly to the schema dataset
+dataset = schema_dataset.add_item({
+    'document': 'Your document text here...',
+    'document_outline': '1. Topic A; 2. Topic B; 3. Topic C',
+    'domain': 'Computer Science',
+    'icl_document': 'Example document for in-context learning...',
+    'icl_query_1': 'Example question 1?',
+    'icl_response_1': 'Example answer 1',
+    'icl_query_2': 'Example question 2?',
+    'icl_response_2': 'Example answer 2',
+    'icl_query_3': 'Example question 3?',
+    'icl_response_3': 'Example answer 3'
 })
+# Option 2: Create your own dataset and validate the schema
+my_dataset = Dataset.from_dict(my_data_dict)
+if my_dataset.features == schema_dataset.features:
+    print("✅ Schema matches - ready to generate!")
+    dataset = my_dataset
+else:
+    print("❌ Schema mismatch - check your columns")
+# Option 3: Get raw requirements for detailed inspection
+requirements = flow.get_dataset_requirements()
+if requirements:
+    print(f"Required: {requirements.required_columns}")
+    print(f"Optional: {requirements.optional_columns}")
+    print(f"Min samples: {requirements.min_samples}")
+```
+#### Dry Run and Generate
+```python
+# Quick Testing with Dry Run
+dry_result = flow.dry_run(dataset, sample_size=1)
+print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
+print(f"Output columns: {dry_result['final_dataset']['columns']}")
 # Generate high-quality QA pairs
 result = flow.generate(dataset)
@@ -128,14 +165,6 @@ faithfulness_scores = result['faithfulness_judgment']
 relevancy_scores = result['relevancy_score']
 ```
-#### Quick Testing with Dry Run
-```python
-# Test the flow with a small sample first
-dry_result = flow.dry_run(dataset, sample_size=1)
-print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
-print(f"Output columns: {dry_result['final_dataset']['columns']}")
-```
 ## 📄 License

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/blocks/llm-blocks.md RENAMED Viewed

@@ -34,12 +34,9 @@ The unified chat block that replaces provider-specific implementations with a si
 ### Basic Usage
 ```python
-from sdg_hub.core.blocks import BlockRegistry
+from sdg_hub.core.blocks import LLMChatBlock
 from datasets import Dataset
-# Get the LLM chat block
-LLMChatBlock = BlockRegistry.get_block("LLMChatBlock")
 # Configure for OpenAI
 chat_block = LLMChatBlock(
     block_name="question_answerer",
@@ -133,7 +130,7 @@ dataset = Dataset.from_dict({
 })
 ```
-#### Async Processing
+#### Async Processing & Concurrency Control
 ```python
 chat_block = LLMChatBlock(
     block_name="async_chat",
@@ -147,6 +144,56 @@ chat_block = LLMChatBlock(
 result = chat_block.generate(large_dataset)
 ```
+**Flow-Level Concurrency Control:**
+When using LLM blocks within flows, you can control concurrency to prevent overwhelming API servers or hitting rate limits:
+```python
+from sdg_hub import Flow
+# Load a flow with LLM blocks
+flow = Flow.from_yaml("path/to/your/flow.yaml")
+flow.set_model_config(model="openai/gpt-4o", api_key="your-key")
+# Control concurrency for each LLM block in the flow
+result = flow.generate(
+    dataset,
+    max_concurrency=5  # Max 5 concurrent requests at any time
+)
+```
+**Benefits of Concurrency Control:**
+- **Rate Limit Management** - Prevent API throttling by limiting concurrent requests
+- **Resource Control** - Manage memory and network usage for large datasets
+- **Provider-Friendly** - Respect API provider recommendations for concurrent requests
+- **Automatic Scaling** - No concurrency limit = maximum parallelism for fastest processing
+**How It Works:**
+The unified async system automatically detects whether you're processing single or multiple messages and applies concurrency control appropriately:
+```python
+# Single message - processed immediately
+single_message = [{"role": "user", "content": "Hello"}]
+# Multiple messages - concurrency controlled via semaphore
+batch_messages = [
+    [{"role": "user", "content": "Question 1"}],
+    [{"role": "user", "content": "Question 2"}],
+    [{"role": "user", "content": "Question 3"}],
+    # ... up to thousands of messages
+]
+# Both cases use the same unified API under the hood
+# Concurrency is managed transparently
+```
+**Performance Guidelines:**
+- **Small datasets (<100 samples)**: No concurrency limit needed
+- **Medium datasets (100-1000 samples)**: `max_concurrency=10-20`
+- **Large datasets (1000+ samples)**: `max_concurrency=5-10` (respect API limits)
+- **Production workloads**: Start conservative and tune based on error rates
 ### Message Format
 LLMChatBlock expects messages in OpenAI chat format:

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/blocks/overview.md RENAMED Viewed

@@ -30,14 +30,15 @@ All blocks inherit from `BaseBlock`, which provides:
 ### Standard Configuration
 ```python
-from sdg_hub.core.blocks import BlockRegistry
+# Import the specific block you need
+from sdg_hub.core.blocks import LLMChatBlock
 # Every block has these standard fields
-MyBlock = BlockRegistry.get_block("SomeBlockType")
-block = MyBlock(
+block = LLMChatBlock(
     block_name="my_unique_block",     # Required: unique identifier
-    input_cols=["column1", "column2"], # Columns this block needs
-    output_cols=["new_column"],       # Columns this block creates
+    input_cols=["input_text"],        # Column this block needs
+    output_cols=["response"],         # Column this block creates
+    model="openai/gpt-4o",            # Required: provider/model format
     # ... block-specific configuration
 )
 ```
@@ -86,13 +87,13 @@ print(f"Found {len(available_blocks)} blocks")
 ### 2. Block Instantiation
 ```python
-# Get a block class by name
-ChatBlock = BlockRegistry.get_block("LLMChatBlock")
+# Import the specific block you need
+from sdg_hub.core.blocks import LLMChatBlock
 # Create an instance with configuration
-chat_block = ChatBlock(
+chat_block = LLMChatBlock(
     block_name="question_answerer",
-    llm_config={"model": "gpt-4o"},
+    model="openai/gpt-4o",
     input_cols=["question"],
     output_cols=["answer"],
     prompt_template="Answer this question: {question}"

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/concepts.md RENAMED Viewed

@@ -159,7 +159,13 @@ Every block validates data at runtime:
 - Watch execution logs for bottlenecks
 - Use async-friendly blocks for LLM operations
-### 4. Design for Reuse
+### 4. Optimize for Scale
+- Use `max_concurrency` parameter to control API request rates
+- Start with conservative concurrency limits (5-10) for production
+- Increase concurrency carefully while monitoring error rates
+- Consider provider-specific rate limits and costs
+### 5. Design for Reuse
 - Create modular flows that can be combined
 - Use parameters for customization points

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/development.md RENAMED Viewed

@@ -123,15 +123,16 @@ Create comprehensive tests following this pattern:
 import pytest
 from datasets import Dataset
-from sdg_hub.core.blocks import BlockRegistry
 from sdg_hub.core.utils.error_handling import MissingColumnError
+# Import your custom block directly
+from .my_new_block import MyNewBlock
 class TestMyNewBlock:
     """Test suite for MyNewBlock."""
     def test_basic_functionality(self):
         """Test basic block functionality."""
-        block = BlockRegistry.get_block("MyNewBlock")(
+        block = MyNewBlock(
             block_name="test_block",
             input_cols=["input"],
             output_cols=["output"]
@@ -149,7 +150,7 @@ class TestMyNewBlock:
     def test_configuration_validation(self):
         """Test parameter validation."""
         with pytest.raises(ValueError):
-            BlockRegistry.get_block("MyNewBlock")(
+            MyNewBlock(
                 block_name="bad_config",
                 input_cols=["input"],
                 output_cols=["output"],
@@ -158,7 +159,7 @@ class TestMyNewBlock:
     def test_missing_columns(self):
         """Test error handling for missing columns."""
-        block = BlockRegistry.get_block("MyNewBlock")(
+        block = MyNewBlock(
             block_name="test_block",
             input_cols=["missing_column"],
             output_cols=["output"]

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/flows/overview.md RENAMED Viewed

@@ -269,17 +269,78 @@ print(f"Sample output: {dry_result['sample_output']}")
 Customize flow behavior at runtime:
 ```python
-# Override default parameters
+# Override default runtime parameters
 result = flow.generate(
     dataset,
-    parameters={
+    runtime_params={
         "max_tokens": 200,
         "temperature": 0.9,
-        "enable_evaluation": False
     }
 )
 ```
+### Block-Specific Runtime Arguments
+You can enable or disable advanced features—such as "thinking mode"—for individual blocks at runtime using the `runtime_params` argument. This allows fine-grained control over block behavior without modifying the flow YAML.
+For example, to disable "thinking mode" for several blocks:
+```python
+# Set runtime_params for specific blocks
+result = flow.generate(
+    dataset,
+    runtime_params = {
+    # LLMChatBlock blocks
+    "llm_chat_block_1": {"extra_body": {"chat_template_kwargs": {"enable_thinking": False}}},
+    }
+)
+```
+### Concurrency Control
+For flows containing LLM blocks, you can control the maximum number of concurrent API requests to prevent overwhelming servers or hitting rate limits:
+```python
+# Basic concurrency control
+result = flow.generate(
+    dataset,
+    max_concurrency=5  # Max 5 concurrent requests per LLM block execution
+)
+# Combined with other parameters
+result = flow.generate(
+    dataset,
+    max_concurrency=10,
+    runtime_params={
+        "temperature": 0.7,
+        "max_tokens": 200
+    }
+)
+```
+**When to Use Concurrency Control:**
+- **Large Datasets** - Process thousands of samples without overwhelming APIs
+- **Rate Limit Management** - Respect provider-specific concurrent request limits
+- **Production Workloads** - Ensure stable, predictable resource usage
+- **Cost Optimization** - Prevent burst API charges from uncontrolled parallelism
+**Recommended Settings:**
+```python
+# Conservative (recommended for production)
+result = flow.generate(dataset, max_concurrency=5)
+# Moderate (good for development/testing)
+result = flow.generate(dataset, max_concurrency=10)
+# Aggressive (only for robust APIs and small datasets)
+result = flow.generate(dataset, max_concurrency=20)
+# No limit (maximum speed, use with caution)
+result = flow.generate(dataset)  # Default behavior
+```
 ## 🚀 Next Steps
 Ready to master the flow system? Explore these detailed guides:

{sdg_hub-0.2.0 → sdg_hub-0.2.2}/docs/quick-start.md RENAMED Viewed

@@ -107,11 +107,16 @@ print(f"🔎 QA Generation Flows: {qa_flows}")
 eval_flows = FlowRegistry.search_flows(tag="evaluation")
 print(f"📊 Evaluation Flows: {eval_flows}")
+# List all blocks by categories
+all_blocks = BlockRegistry.list_blocks(grouped=True)
+for category, blocks in all_blocks.items():
+    print(f"Blocks for category {category}: {blocks}")
 # Find blocks by category
-llm_blocks = BlockRegistry.search_blocks(category="llm")
+llm_blocks = BlockRegistry.list_blocks(category="llm")
 print(f"🧠 LLM Blocks: {llm_blocks}")
-transform_blocks = BlockRegistry.search_blocks(category="transform")
+transform_blocks = BlockRegistry.list_blocks(category="transform")
 print(f"🔄 Transform Blocks: {transform_blocks}")
 ```

sdg-hub 0.2.0__tar.gz → 0.2.2__tar.gz

sdg-hub 0.2.0__tar.gz → 0.2.2__tar.gz