sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
sdg_hub/pipeline.py CHANGED
@@ -1,6 +1,17 @@
+"""
+Deprecated Pipeline class for data generation pipelines.
+
+Use the Flow class directly for new code.
+"""
+
 # SPDX-License-Identifier: Apache-2.0
+# Standard
+import warnings
+from typing import List, Dict, Any
+
 # Third Party
 from datasets import Dataset
+from datasets.data_files import EmptyDatasetError
 
 # Local
 from .logger_config import setup_logger
@@ -8,31 +19,75 @@ from .logger_config import setup_logger
 logger = setup_logger(__name__)
 
 
-class EmptyDatasetError(Exception):
-    pass
+class Pipeline:
+    """A class representing a data generation pipeline.
 
+    This class is deprecated and will be removed in a future version.
+    Use the Flow class directly instead.
 
-class Pipeline:
-    def __init__(self, chained_blocks: list) -> None:
+    Parameters
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+
+    Attributes
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+    """
+
+    def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
         """
         Initialize the Pipeline class with a configuration dictionary.
-        config_dict: the run config py or yaml loaded into a dictionary
+
+        DEPRECATED: This class is deprecated and will be removed in a future version.
+        Use the Flow class directly instead.
         """
+        warnings.warn(
+            "Pipeline class is deprecated and will be removed in a future version. "
+            "Use Flow class directly instead of wrapping it with Pipeline.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
 
-    def _drop_duplicates(self, dataset, cols):
-        """
-        Drop duplicates from the dataset based on the columns provided.
+    def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
+        """Drop duplicates from the dataset based on the columns provided.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset.
+        cols : List[str]
+            Columns to consider for duplicate detection.
+
+        Returns
+        -------
+        Dataset
+            Dataset with duplicates removed.
         """
         df = dataset.to_pandas()
         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
         return Dataset.from_pandas(df)
 
-    def generate(self, dataset) -> Dataset:
-        """
-        Generate the dataset by running the pipeline steps.
-        dataset: the input dataset
+    def generate(self, dataset: Dataset) -> Dataset:
+        """Generate the dataset by running the pipeline steps.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+
+        Returns
+        -------
+        Dataset
+            The processed dataset.
+
+        Raises
+        ------
+        EmptyDatasetError
+            If a block produces an empty dataset.
         """
         for block_prop in self.chained_blocks:
             block_type = block_prop["block_type"]
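
The deprecation shim keeps Pipeline functional but warns on construction. A minimal sketch of what callers now see, assuming sdg-hub 0.1.2 is installed (the empty block list is illustrative only):

    # Observing the DeprecationWarning added to Pipeline.__init__
    import warnings

    from sdg_hub.pipeline import Pipeline

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        Pipeline(chained_blocks=[])  # any block-config list triggers the warning
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

New code should build a Flow and call flow.generate(dataset) directly.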
sdg_hub/prompts.py CHANGED
@@ -20,3 +20,55 @@ def mistral_chat_template():
 @PromptRegistry.register("meta-llama/Llama-3.3")
 def meta_llama_chat_template():
     return """{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
+
+
+@PromptRegistry.register("microsoft/phi-4")
+def microsoft_phi_chat_template():
+    return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
+
+@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
+def nemotron_chat_template():
+    """
+    Format chat messages for the Nemotron model, including a system prompt and structured message headers.
+
+    The template starts with a system message containing "detailed thinking on", then iterates over messages, wrapping each with start and end header tokens and an end-of-text token. For assistant messages containing a `</think>` tag, only the content after this tag is included. Optionally appends an assistant prompt if generation is requested.
+    """
+    return """{{- bos_token }}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
+{%- for message in messages %}
+{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
+{%- set content = message['content'].split('</think>')[-1].lstrip() %}
+{%- else %}
+{%- set content = message['content'] %}
+{%- endif %}
+{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}"""
+
+
+@PromptRegistry.register("Qwen/Qwen2.5")
+def qwen_2_5_chat_template():
+    """
+    Formats chat messages into the prompt structure required by the Qwen 2.5 model family, supporting system messages, tool descriptions, function call instructions, and role-based message formatting.
+
+    If tools are provided, includes tool signatures and instructions for function calls in the system prompt. User, assistant, and tool messages are wrapped with special tokens, and assistant tool calls are serialized as JSON within XML tags. Optionally appends a generation prompt for the assistant.
+    """
+    return """{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- else %}\n        {{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {{- \'<|im_start|>\' + message.role }}\n        {%- if message.content %}\n            {{- \'\\n\' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- \'\\n<tool_call>\\n{"name": "\' }}\n            {{- tool_call.name }}\n            {{- \'", "arguments": \' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \'}\\n</tool_call>\' }}\n        {%- endfor %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- message.content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n"""
+
+
+@PromptRegistry.register("Qwen/Qwen3")
+def qwen_3_chat_template():
+    """
+    Formats chat messages for the Qwen 3 model family, supporting multi-step tool usage, reasoning content, and special XML tags for tool calls and responses.
+
+    This template handles system messages, user and assistant roles, and tool interactions. When tools are provided, it outputs their signatures and instructions for function calls. It tracks the last user query to determine where to insert assistant reasoning content within `<think>` tags. Assistant tool calls are serialized as JSON within `<tool_call>` tags, and tool responses are grouped inside `<tool_response>` tags. Optionally, a generation prompt and empty reasoning block can be added.

+    Parameters:
+        tools (optional): List of tool signature objects to be included in the prompt.
+        messages: List of message objects, each with a role and content, and optionally tool_calls or reasoning_content.
+        add_generation_prompt (optional): If true, appends an assistant prompt for generation.
+        enable_thinking (optional): If false, inserts an empty reasoning block in the assistant prompt.
+    """
+    return """{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0].role == \'system\' %}\n        {{- messages[0].content + \'\\n\\n\' }}\n    {%- endif %}\n    {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0].role == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith(\'<tool_response>\') and message.content.endswith(\'</tool_response>\')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = \'\' %}\n    {%- endif %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {%- set reasoning_content = \'\' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if \'</think>\' in content %}\n                {%- set reasoning_content = content.split(\'</think>\')[0].rstrip(\'\\n\').split(\'<think>\')[-1].lstrip(\'\\n\') %}\n                {%- set content = content.split(\'</think>\')[-1].lstrip(\'\\n\') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- \'<|im_start|>\' + message.role + \'\\n<think>\\n\' + reasoning_content.strip(\'\\n\') + \'\\n</think>\\n\\n\' + content.lstrip(\'\\n\') }}\n            {%- else %}\n                {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- \'<|im_start|>\' + message.role + \'\\n\' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- \'\\n\' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- \'<tool_call>\\n{"name": "\' }}\n                {{- tool_call.name }}\n                {{- \'", "arguments": \' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- \'}\\n</tool_call>\' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- \'<think>\\n\\n</think>\\n\\n\' }}\n    {%- endif %}\n{%- endif %}"""
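
Each registered function returns a raw Jinja2 template string keyed by a model-family name. A minimal rendering sketch using the new microsoft/phi-4 template, assuming jinja2 is installed (how sdg-hub looks templates up from PromptRegistry internally is not part of this diff):

    # Rendering the newly registered phi-4 chat template by hand
    from jinja2 import Template

    from sdg_hub.prompts import microsoft_phi_chat_template

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize synthetic data generation."},
    ]
    prompt = Template(microsoft_phi_chat_template()).render(
        messages=messages, add_generation_prompt=True
    )
    # prompt ends with "<|im_start|>assistant<|im_sep|>", ready for completion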
sdg_hub/sdg.py CHANGED
@@ -1,35 +1,83 @@
 # SPDX-License-Identifier: Apache-2.0
+
+"""Synthetic Data Generator (SDG) module for managing data generation flows."""
+
 # Standard
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List
+from typing import List, Optional, Tuple
 import traceback
-import uuid
 
 # Third Party
-from datasets import Dataset, load_dataset
-from datasets.data_files import EmptyDatasetError
+from datasets import Dataset
 from tqdm import tqdm
 
 # Local
+from .checkpointer import Checkpointer
+from .flow import Flow
 from .logger_config import setup_logger
-from .pipeline import Pipeline
 from .utils.datautils import safe_concatenate_datasets
 
-
 logger = setup_logger(__name__)
 
 
 class SDG:
+    """Synthetic Data Generator class.
+
+    This class manages the generation of synthetic data using one or more
+    data generation flows.
+
+    Parameters
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int, optional
+        Number of worker threads to use, by default 1
+    batch_size : Optional[int], optional
+        Size of batches to process, by default None
+    save_freq : Optional[int], optional
+        Frequency of checkpoint saves, by default None
+
+    Attributes
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int
+        Number of worker threads to use.
+    batch_size : Optional[int]
+        Size of batches to process.
+    save_freq : Optional[int]
+        Frequency of checkpoint saves.
+    """
+
     def __init__(
-        self, pipelines: List[Pipeline], num_workers=1, batch_size=None, save_freq=None
+        self,
+        flows: List[Flow],
+        num_workers: int = 1,
+        batch_size: Optional[int] = None,
+        save_freq: Optional[int] = None,
     ) -> None:
-        self.pipelines = pipelines
+        self.flows = flows
         self.num_workers = num_workers
         self.batch_size = batch_size
         self.save_freq = save_freq
 
-    def _split_dataset(self, dataset: Dataset, batch_size: int) -> List[Dataset]:
-        """Split the dataset into smaller batches."""
+    def _split_dataset(
+        self, dataset: Dataset, batch_size: int
+    ) -> List[Tuple[int, int]]:
+        """Split the dataset into smaller batches.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The dataset to split.
+        batch_size : int
+            Size of each batch.
+
+        Returns
+        -------
+        List[Tuple[int, int]]
+            List of (start, end) indices for each batch.
+        """
         total_size = len(dataset)
         num_batches = (total_size + batch_size - 1) // batch_size
 
@@ -40,94 +88,87 @@ class SDG:
 
         return batches
 
-    def _get_missing_data(self, seed_data, generated_data):
-        # Get the common columns between the two datasets
-        common_columns = list(
-            set(seed_data.column_names) & set(generated_data.column_names)
-        )
-
-        # Extract the relevant data based on common columns
-        seed_data_common = seed_data.select_columns(common_columns)
-        generated_data_common = generated_data.select_columns(common_columns)
-
-        # Convert to Pandas DataFrames for easier comparison
-        seed_df = seed_data_common.to_pandas()
-        generated_df = generated_data_common.to_pandas()
-
-        # Identify missing rows
-        missing_df = seed_df[
-            ~seed_df.apply(tuple, 1).isin(generated_df.apply(tuple, 1))
-        ]
-
-        # Convert back to Dataset
-        missing_data = Dataset.from_pandas(missing_df, preserve_index=False)
-
-        return missing_data
-
-    def _save_intermediate_checkpoint(self, dataset, checkpoint_dir):
-        checkpoint_id = uuid.uuid4().hex
-        checkpoint_file = f"{checkpoint_dir}/data_checkpoint_{checkpoint_id}.jsonl"
-        logger.info(f"Saving checkpoint to {checkpoint_file}")
-        dataset.to_json(checkpoint_file, orient="records", lines=True)
-
     @staticmethod
-    def _generate_data(pipelines, input_split, ds, i=None):
+    def _generate_data(
+        flows: List[Flow],
+        input_split: Tuple[int, int],
+        ds: Dataset,
+        i: Optional[int] = None,
+    ) -> Optional[Dataset]:
+        """Generate data for a single split using the provided flows.
+
+        Parameters
+        ----------
+        flows : List[Flow]
+            List of flows to execute.
+        input_split : Tuple[int, int]
+            (start, end) indices for the current split.
+        ds : Dataset
+            The full input dataset.
+        i : Optional[int], optional
+            Split index for logging, by default None
+
+        Returns
+        -------
+        Optional[Dataset]
+            Generated dataset for the split, or None if generation failed.
+        """
         logger.info(f"Processing split {i}")
         input_split = ds.select(range(input_split[0], input_split[1]))
         try:
-            for pipeline in pipelines:
-                input_split = pipeline.generate(input_split)
+            for flow in flows:
+                input_split = flow.generate(input_split)
             return input_split
         except Exception as e:
             logger.error(f"Error processing split {i}: {e}")
             traceback.print_exc()
             return None
 
-    def generate(self, dataset: Dataset, checkpoint_dir=None) -> Dataset:
-        # check if checkpoint_dir exists
-        pre_generated_data = []
-        if checkpoint_dir is not None:
-            try:
-                # check if there are any existing checkpoints
-                pre_generated_data = load_dataset(
-                    "json", data_dir=checkpoint_dir, split="train"
-                )
-                logger.info(
-                    f"Loading existing checkpoints from {checkpoint_dir}, with {pre_generated_data.num_rows} rows"
-                )
-                seed_data = self._get_missing_data(dataset, pre_generated_data)
-                if seed_data.num_rows == 0:
-                    logger.info(
-                        f"All seed data has been generated, no missing rows found, returning data from {checkpoint_dir}"
-                    )
-                    return pre_generated_data
-                logger.info(f"Found {seed_data.num_rows} missing rows in the dataset")
-
-            except EmptyDatasetError:
-                logger.info(
-                    f"No existing checkpoints found in {checkpoint_dir}, generating from scratch"
-                )
-                seed_data = dataset
-
-        else:
-            seed_data = dataset
+    def generate(
+        self, dataset: Dataset, checkpoint_dir: Optional[str] = None
+    ) -> Dataset:
+        """Generate synthetic data using the configured flows.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+        checkpoint_dir : Optional[str], optional
+            Directory to save checkpoints, by default None
+
+        Returns
+        -------
+        Dataset
+            The generated dataset.
+
+        Notes
+        -----
+        If checkpoint_dir is provided, the generation process can be resumed
+        from the last checkpoint in case of interruption.
+        """
+        # Initialize checkpointer
+        checkpointer = Checkpointer(checkpoint_dir, self.save_freq)
+
+        # Load existing checkpoints and determine missing data
+        seed_data, pre_generated_data = checkpointer.load_existing_data(dataset)
+
+        # If all data has been generated, return the pre-generated data
+        if seed_data.num_rows == 0 and pre_generated_data is not None:
+            return pre_generated_data
 
         if not self.batch_size:
             # If batch size is not provided, generate the dataset in a single pass
             generated_dataset = seed_data
-            # generated_data is initialized with seed_data, and it gets updated with each pipeline
-            for pipeline in self.pipelines:
-                generated_dataset = pipeline.generate(seed_data)
+            # generated_data is initialized with seed_data, and it gets updated with each flow
+            for flow in self.flows:
+                generated_dataset = flow.generate(generated_dataset)
             return generated_dataset
-
+
         logger.info("Splitting the dataset into smaller batches")
-        input_splits = (
-            self._split_dataset(seed_data, self.batch_size)
-            if self.batch_size
-            else [seed_data]
-        )
+        input_splits = self._split_dataset(seed_data, self.batch_size)
         logger.info(
-            f"Generating dataset with {len(input_splits)} splits, batch size {self.batch_size}, and {self.num_workers} workers"
+            f"Generating dataset with {len(input_splits)} splits, "
+            f"batch size {self.batch_size}, and {self.num_workers} workers"
         )
 
         generated_data = [pre_generated_data] if pre_generated_data else []
@@ -136,7 +177,7 @@ class SDG:
         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             futures = [
                 executor.submit(
-                    self._generate_data, self.pipelines, input_split, seed_data, i
+                    self._generate_data, self.flows, input_split, seed_data, i
                 )
                 for i, input_split in enumerate(input_splits)
             ]
@@ -147,16 +188,17 @@ class SDG:
                 if generated_data_split:
                     generated_data.append(generated_data_split)
                     logger.info(f"Finished future processing split {i} \n\n")
-                    if self.save_freq and (i + 1) % self.save_freq == 0:
+
+                    # Use checkpointer to handle intermediate saves
+                    if checkpointer.should_save_checkpoint(i):
                         # Save only the new splits since the last checkpoint
                         new_splits = generated_data[last_saved_split_index : i + 1]
                         checkpoint_dataset = safe_concatenate_datasets(new_splits)
                         # check if checkpoint_dataset is not None
                         if checkpoint_dataset:
-                            self._save_intermediate_checkpoint(
-                                checkpoint_dataset, checkpoint_dir
+                            checkpointer.save_intermediate_checkpoint(
+                                checkpoint_dataset
                             )
-
                         last_saved_split_index = i + 1
 
         generated_dataset = safe_concatenate_datasets(generated_data)
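
_split_dataset now returns (start, end) index pairs instead of materialized sub-datasets; each worker slices the shared dataset itself with ds.select(range(start, end)). The loop that builds the pairs falls outside the hunks above, so the min-capped end index below is an assumption consistent with the docstring; a standalone sketch of the ceil-division batching:

    # Batching arithmetic documented for SDG._split_dataset
    def split_indices(total_size: int, batch_size: int) -> list[tuple[int, int]]:
        num_batches = (total_size + batch_size - 1) // batch_size  # ceil division
        return [
            (i * batch_size, min((i + 1) * batch_size, total_size))
            for i in range(num_batches)
        ]

    assert split_indices(10, 4) == [(0, 4), (4, 8), (8, 10)]

A typical driver is then sdg = SDG(flows=[my_flow], num_workers=4, batch_size=100, save_freq=2) followed by sdg.generate(dataset, checkpoint_dir="checkpoints/"); Flow construction itself is defined in sdg_hub/flow.py, outside this excerpt.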
sdg_hub/utils/__init__.py CHANGED
@@ -3,3 +3,8 @@
 # This is part of the public API, and used by instructlab
 class GenerateException(Exception):
     """An exception raised during generate step."""
+
+
+from .path_resolution import resolve_path
+
+__all__ = ["GenerateException", "resolve_path"]
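
With the re-export in place, both public names resolve from the package root; a one-line check (assuming sdg-hub 0.1.2 is installed):

    from sdg_hub.utils import GenerateException, resolve_path  # both listed in __all__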
sdg_hub/utils/config_validation.py ADDED
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Configuration validation utilities for SDG Hub.
+
+This module provides functions to validate configuration files used by blocks,
+ensuring they meet the required schema and contain all necessary fields.
+"""
+
+# Standard
+from typing import Any, Dict, List
+
+# Local
+from ..logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def validate_prompt_config_schema(
+    config: Dict[str, Any], config_path: str
+) -> tuple[bool, List[str]]:
+    """Validate that a prompt configuration file has the required schema fields.
+
+    For prompt template configs, 'system' and 'generation' are required fields.
+    Other fields like 'introduction', 'principles', 'examples', 'start_tags', 'end_tags' are optional.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        The loaded configuration dictionary.
+    config_path : str
+        The path to the configuration file (for error reporting).
+
+    Returns
+    -------
+    tuple[bool, List[str]]
+        A tuple containing:
+        - bool: True if schema is valid, False otherwise
+        - List[str]: List of validation error messages (empty if valid)
+    """
+    required_fields = ["system", "generation"]
+    errors = []
+
+    # Ensure config is a dictionary
+    if not isinstance(config, dict):
+        errors.append(f"Configuration must be a dictionary, got {type(config).__name__}")
+        return False, errors
+
+    # Check for missing required fields
+    missing_fields = [field for field in required_fields if field not in config]
+    if missing_fields:
+        errors.append(f"Missing required fields: {missing_fields}")
+
+    # Check for empty or null required fields and validate they are strings
+    for field in required_fields:
+        if field in config:
+            value = config[field]
+            if value is None:
+                errors.append(f"Required field '{field}' is null")
+            elif not isinstance(value, str):
+                errors.append(f"Required field '{field}' must be a string, got {type(value).__name__}")
+            elif not value.strip():
+                errors.append(f"Required field '{field}' is empty")
+
+    # Check optional string fields are strings when present
+    string_fields = ["introduction", "principles", "examples"]
+    for field in string_fields:
+        if field in config:
+            value = config[field]
+            if value is not None and not isinstance(value, str):
+                errors.append(f"Field '{field}' must be a string, got {type(value).__name__}")
+
+    # Check start_tags and end_tags are lists of strings when present
+    tag_fields = ["start_tags", "end_tags"]
+    for field in tag_fields:
+        if field in config:
+            value = config[field]
+            if value is not None:
+                if not isinstance(value, list):
+                    errors.append(f"Field '{field}' must be a list, got {type(value).__name__}")
+                else:
+                    for i, tag in enumerate(value):
+                        if not isinstance(tag, str):
+                            errors.append(f"Field '{field}[{i}]' must be a string, got {type(tag).__name__}")
+
+    # Log validation results
+    if errors:
+        for error in errors:
+            logger.error(f"Config validation failed for {config_path}: {error}")
+        return False, errors
+
+    logger.debug(f"Config validation passed for {config_path}")
+    return True, []
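
The validator returns a (bool, errors) pair and logs every failure against the supplied path. A minimal usage sketch, assuming sdg-hub 0.1.2 is installed (config contents and paths are illustrative):

    from sdg_hub.utils.config_validation import validate_prompt_config_schema

    good = {"system": "You are a helpful assistant.", "generation": "Answer the question."}
    ok, errors = validate_prompt_config_schema(good, "configs/example.yaml")
    assert ok and errors == []

    bad = {"system": ""}  # 'generation' missing, 'system' empty
    ok, errors = validate_prompt_config_schema(bad, "configs/bad.yaml")
    assert not ok
    # errors == ["Missing required fields: ['generation']",
    #            "Required field 'system' is empty"]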
sdg_hub/utils/error_handling.py ADDED
@@ -0,0 +1,94 @@
+"""Custom exception classes for SDG Hub error handling."""
+
+
+class SDGHubError(Exception):
+    """Base exception class for all SDG Hub errors."""
+
+    def __init__(self, message: str, details: str = None):
+        """Initialize SDGHubError.
+
+        Parameters
+        ----------
+        message : str
+            The main error message.
+        details : str, optional
+            Additional details about the error.
+        """
+        self.message = message
+        self.details = details
+        full_message = message
+        if details:
+            full_message = f"{message}\nDetails: {details}"
+        super().__init__(full_message)
+
+
+class FlowRunnerError(SDGHubError):
+    """Base exception class for flow runner errors."""
+
+    pass
+
+
+class DatasetLoadError(FlowRunnerError):
+    """Raised when dataset loading fails."""
+
+    pass
+
+
+class FlowConfigurationError(FlowRunnerError):
+    """Raised when flow configuration is invalid."""
+
+    pass
+
+
+class APIConnectionError(FlowRunnerError):
+    """Raised when API connection fails."""
+
+    pass
+
+
+class DataGenerationError(FlowRunnerError):
+    """Raised when data generation fails."""
+
+    pass
+
+
+class DataSaveError(FlowRunnerError):
+    """Raised when saving generated data fails."""
+
+    pass
+
+
+class BlockError(SDGHubError):
+    """Base exception class for block-related errors."""
+
+    pass
+
+
+class BlockConfigurationError(BlockError):
+    """Raised when block configuration is invalid."""
+
+    pass
+
+
+class BlockExecutionError(BlockError):
+    """Raised when block execution fails."""
+
+    pass
+
+
+class FlowError(SDGHubError):
+    """Base exception class for flow-related errors."""
+
+    pass
+
+
+class FlowValidationError(FlowError):
+    """Raised when flow validation fails."""
+
+    pass
+
+
+class FlowExecutionError(FlowError):
+    """Raised when flow execution fails."""
+
+    pass
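
Every class in the new hierarchy derives from SDGHubError, which folds the optional details string into the rendered message, so callers can catch the base class alone. A minimal sketch (the message text is illustrative):

    from sdg_hub.utils.error_handling import BlockExecutionError, SDGHubError

    try:
        raise BlockExecutionError(
            "LLMBlock failed to produce output",
            details="upstream request timed out",
        )
    except SDGHubError as err:
        print(err)          # message plus "Details: upstream request timed out"
        print(err.message)  # original message, without the details suffix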