PyPI - sdg-hub - Versions diffs - 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

sdg_hub/_version.py +2 -2
sdg_hub/blocks/__init__.py +35 -5
sdg_hub/blocks/block.py +58 -16
sdg_hub/blocks/llmblock.py +149 -204
sdg_hub/blocks/utilblocks.py +500 -43
sdg_hub/checkpointer.py +139 -0
sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
sdg_hub/configs/skills/contexts.yaml +18 -11
sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
sdg_hub/configs/skills/freeform_questions.yaml +21 -16
sdg_hub/configs/skills/freeform_responses.yaml +19 -25
sdg_hub/configs/skills/router.yaml +53 -6
sdg_hub/flow.py +351 -21
sdg_hub/flow_runner.py +216 -0
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
sdg_hub/pipeline.py +67 -12
sdg_hub/prompts.py +26 -0
sdg_hub/sdg.py +128 -86
sdg_hub/utils/config_validation.py +91 -0
sdg_hub/utils/validation_result.py +10 -0
sdg_hub-0.1.1.dist-info/METADATA +190 -0
sdg_hub-0.1.1.dist-info/RECORD +86 -0
{sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
sdg_hub/blocks/filterblock.py +0 -76
sdg_hub/blocks/iterblock.py +0 -31
sdg_hub/blocks/rmblocks.py +0 -194
sdg_hub/configs/annotations/simple.yaml +0 -10
sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
sdg_hub/utils/chunking.py +0 -73
sdg_hub/utils/docprocessor.py +0 -357
sdg_hub/utils/parse_and_convert.py +0 -392
sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
sdg_hub-0.1.0a3.dist-info/RECORD +0 -90
/sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
/sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
/sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
/sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
/sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
/sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
/sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
/sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
/sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
/sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
{sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
{sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0

sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} RENAMED Viewed

@@ -2,34 +2,34 @@
   block_config:
     block_name: router
     config_path: configs/skills/router.yaml
-    model_id: skill-classifier-v3-clm
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - route
   gen_kwargs:
     temperature: 0
-    max_tokens: 1
+    max_tokens: 5
     extra_body:
-      allowed_token_ids:
-        - 32001
-        - 32002
-        - 32003
-        - 32004
-        - 32005
-        - 32006
-        - 32007
-        - 32008
+      guided_choice:
+        - "coding"
+        - "extraction"
+        - "humanities"
+        - "math"
+        - "reasoning"
+        - "roleplay"
+        - "STEM"
+        - "writing"
 - block_type: SamplePopulatorBlock
   block_config:
     block_name: icl_populator
     config_paths:
-      - configs/skills/_A_.yaml
-      - configs/skills/_B_.yaml
-      - configs/skills/_C_.yaml
-      - configs/skills/_D_.yaml
-      - configs/skills/_E_.yaml
-      - configs/skills/_F_.yaml
-      - configs/skills/_G_.yaml
-      - configs/skills/_H_.yaml
+      - configs/skills/icl_examples/coding.yaml
+      - configs/skills/icl_examples/extraction.yaml
+      - configs/skills/icl_examples/humanities.yaml
+      - configs/skills/icl_examples/math.yaml
+      - configs/skills/icl_examples/reasoning.yaml
+      - configs/skills/icl_examples/roleplay.yaml
+      - configs/skills/icl_examples/STEM.yaml
+      - configs/skills/icl_examples/writing.yaml
     column_name: route
     batch_kwargs:
       num_procs: 8
@@ -37,8 +37,7 @@
   block_config:
     block_name: analyzer
     config_path: configs/skills/analyzer.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - analysis
       - rubric
@@ -46,24 +45,21 @@
   block_config:
     block_name: critic
     config_path: configs/skills/critic.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - critique
 - block_type: LLMBlock
   block_config:
     block_name: planner
     config_path: configs/skills/planner.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - plan
 - block_type: LLMBlock
   block_config:
     block_name: revised_responder
     config_path: configs/skills/revised_responder.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - revised_response
   drop_columns:
@@ -78,8 +74,7 @@
   block_config:
     block_name: judge
     config_path: configs/skills/judge.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    model_prompt: <s> [INST] {prompt} [/INST]
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - judgement
       - verdict
@@ -100,9 +95,9 @@
        Assistant A: "response"
        Assistant B: "revised_response"
     choice_col: verdict
-    output_col: chosen_reponse
+    output_col: chosen_response
     batch_kwargs:
       num_procs: 8
   drop_columns:
     - judgemnent
-    - verdict
+    - verdict

sdg_hub/flows/generation/skills/synth_skills.yaml CHANGED Viewed

@@ -2,7 +2,7 @@
   block_config:
     block_name: gen_questions
     config_path: configs/skills/freeform_questions.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - question
     batch_kwargs:
@@ -13,7 +13,7 @@
   block_config:
     block_name: eval_questions
     config_path: configs/skills/evaluate_freeform_questions.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - evaluation
       - score
@@ -34,14 +34,14 @@
   block_config:
     block_name: gen_responses
     config_path: configs/skills/freeform_responses.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - response
 - block_type: LLMBlock
   block_config:
     block_name: evaluate_qa_pair
     config_path: configs/skills/evaluate_freeform_pair.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+    model_id: meta-llama/Llama-3.3-70B-Instruct
     output_cols:
       - evaluation
       - score

sdg_hub/pipeline.py CHANGED Viewed

@@ -1,6 +1,17 @@
+"""
+Deprecated Pipeline class for data generation pipelines.
+Use the Flow class directly for new code.
+"""
 # SPDX-License-Identifier: Apache-2.0
+# Standard
+import warnings
+from typing import List, Dict, Any
 # Third Party
 from datasets import Dataset
+from datasets.data_files import EmptyDatasetError
 # Local
 from .logger_config import setup_logger
@@ -8,31 +19,75 @@ from .logger_config import setup_logger
 logger = setup_logger(__name__)
-class EmptyDatasetError(Exception):
-    pass
+class Pipeline:
+    """A class representing a data generation pipeline.
+    This class is deprecated and will be removed in a future version.
+    Use the Flow class directly instead.
-class Pipeline:
-    def __init__(self, chained_blocks: list) -> None:
+    Parameters
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+    Attributes
+    ----------
+    chained_blocks : List[Dict[str, Any]]
+        List of block configurations to execute in sequence.
+    """
+    def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
         """
         Initialize the Pipeline class with a configuration dictionary.
-        config_dict: the run config py or yaml loaded into a dictionary
+        DEPRECATED: This class is deprecated and will be removed in a future version.
+        Use the Flow class directly instead.
         """
+        warnings.warn(
+            "Pipeline class is deprecated and will be removed in a future version. "
+            "Use Flow class directly instead of wrapping it with Pipeline.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
-    def _drop_duplicates(self, dataset, cols):
-        """
-        Drop duplicates from the dataset based on the columns provided.
+    def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
+        """Drop duplicates from the dataset based on the columns provided.
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset.
+        cols : List[str]
+            Columns to consider for duplicate detection.
+        Returns
+        -------
+        Dataset
+            Dataset with duplicates removed.
         """
         df = dataset.to_pandas()
         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
         return Dataset.from_pandas(df)
-    def generate(self, dataset) -> Dataset:
-        """
-        Generate the dataset by running the pipeline steps.
-        dataset: the input dataset
+    def generate(self, dataset: Dataset) -> Dataset:
+        """Generate the dataset by running the pipeline steps.
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+        Returns
+        -------
+        Dataset
+            The processed dataset.
+        Raises
+        ------
+        EmptyDatasetError
+            If a block produces an empty dataset.
         """
         for block_prop in self.chained_blocks:
             block_type = block_prop["block_type"]

sdg_hub/prompts.py CHANGED Viewed

@@ -15,3 +15,29 @@ def instructlab_chat_template():
 @PromptRegistry.register("mistralai")
 def mistral_chat_template():
     return """{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n<s>\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + '</s>'}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"""
+@PromptRegistry.register("meta-llama/Llama-3.3")
+def meta_llama_chat_template():
+    return """{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n     
       {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
+@PromptRegistry.register("microsoft/phi-4")
+def microsoft_phi_chat_template():
+    return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
+@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
+def nemotron_chat_template():
+    return """{{- bos_token }}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
+{%- for message in messages %}
+  {%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
+    {%- set content = message['content'].split('</think>')[-1].lstrip() %}
+  {%- else %}
+    {%- set content = message['content'] %}
+  {%- endif %}
+  {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+  {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}"""

sdg_hub/sdg.py CHANGED Viewed

@@ -1,35 +1,83 @@
 # SPDX-License-Identifier: Apache-2.0
+"""Synthetic Data Generator (SDG) module for managing data generation flows."""
 # Standard
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List
+from typing import List, Optional, Tuple
 import traceback
-import uuid
 # Third Party
-from datasets import Dataset, load_dataset
-from datasets.data_files import EmptyDatasetError
+from datasets import Dataset
 from tqdm import tqdm
 # Local
+from .checkpointer import Checkpointer
+from .flow import Flow
 from .logger_config import setup_logger
-from .pipeline import Pipeline
 from .utils.datautils import safe_concatenate_datasets
 logger = setup_logger(__name__)
 class SDG:
+    """Synthetic Data Generator class.
+    This class manages the generation of synthetic data using one or more
+    data generation flows.
+    Parameters
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int, optional
+        Number of worker threads to use, by default 1
+    batch_size : Optional[int], optional
+        Size of batches to process, by default None
+    save_freq : Optional[int], optional
+        Frequency of checkpoint saves, by default None
+    Attributes
+    ----------
+    flows : List[Flow]
+        List of flows to execute.
+    num_workers : int
+        Number of worker threads to use.
+    batch_size : Optional[int]
+        Size of batches to process.
+    save_freq : Optional[int]
+        Frequency of checkpoint saves.
+    """
     def __init__(
-        self, pipelines: List[Pipeline], num_workers=1, batch_size=None, save_freq=None
+        self,
+        flows: List[Flow],
+        num_workers: int = 1,
+        batch_size: Optional[int] = None,
+        save_freq: Optional[int] = None,
     ) -> None:
-        self.pipelines = pipelines
+        self.flows = flows
         self.num_workers = num_workers
         self.batch_size = batch_size
         self.save_freq = save_freq
-    def _split_dataset(self, dataset: Dataset, batch_size: int) -> List[Dataset]:
-        """Split the dataset into smaller batches."""
+    def _split_dataset(
+        self, dataset: Dataset, batch_size: int
+    ) -> List[Tuple[int, int]]:
+        """Split the dataset into smaller batches.
+        Parameters
+        ----------
+        dataset : Dataset
+            The dataset to split.
+        batch_size : int
+            Size of each batch.
+        Returns
+        -------
+        List[Tuple[int, int]]
+            List of (start, end) indices for each batch.
+        """
         total_size = len(dataset)
         num_batches = (total_size + batch_size - 1) // batch_size
@@ -40,94 +88,87 @@ class SDG:
         return batches
-    def _get_missing_data(self, seed_data, generated_data):
-        # Get the common columns between the two datasets
-        common_columns = list(
-            set(seed_data.column_names) & set(generated_data.column_names)
-        )
-        # Extract the relevant data based on common columns
-        seed_data_common = seed_data.select_columns(common_columns)
-        generated_data_common = generated_data.select_columns(common_columns)
-        # Convert to Pandas DataFrames for easier comparison
-        seed_df = seed_data_common.to_pandas()
-        generated_df = generated_data_common.to_pandas()
-        # Identify missing rows
-        missing_df = seed_df[
-            ~seed_df.apply(tuple, 1).isin(generated_df.apply(tuple, 1))
-        ]
-        # Convert back to Dataset
-        missing_data = Dataset.from_pandas(missing_df, preserve_index=False)
-        return missing_data
-    def _save_intermediate_checkpoint(self, dataset, checkpoint_dir):
-        checkpoint_id = uuid.uuid4().hex
-        checkpoint_file = f"{checkpoint_dir}/data_checkpoint_{checkpoint_id}.jsonl"
-        logger.info(f"Saving checkpoint to {checkpoint_file}")
-        dataset.to_json(checkpoint_file, orient="records", lines=True)
     @staticmethod
-    def _generate_data(pipelines, input_split, ds, i=None):
+    def _generate_data(
+        flows: List[Flow],
+        input_split: Tuple[int, int],
+        ds: Dataset,
+        i: Optional[int] = None,
+    ) -> Optional[Dataset]:
+        """Generate data for a single split using the provided flows.
+        Parameters
+        ----------
+        flows : List[Flow]
+            List of flows to execute.
+        input_split : Tuple[int, int]
+            (start, end) indices for the current split.
+        ds : Dataset
+            The full input dataset.
+        i : Optional[int], optional
+            Split index for logging, by default None
+        Returns
+        -------
+        Optional[Dataset]
+            Generated dataset for the split, or None if generation failed.
+        """
         logger.info(f"Processing split {i}")
         input_split = ds.select(range(input_split[0], input_split[1]))
         try:
-            for pipeline in pipelines:
-                input_split = pipeline.generate(input_split)
+            for flow in flows:
+                input_split = flow.generate(input_split)
             return input_split
         except Exception as e:
             logger.error(f"Error processing split {i}: {e}")
             traceback.print_exc()
             return None
-    def generate(self, dataset: Dataset, checkpoint_dir=None) -> Dataset:
-        # check if checkpoint_dir exists
-        pre_generated_data = []
-        if checkpoint_dir is not None:
-            try:
-                # check if there are any existing checkpoints
-                pre_generated_data = load_dataset(
-                    "json", data_dir=checkpoint_dir, split="train"
-                )
-                logger.info(
-                    f"Loading existing checkpoints from {checkpoint_dir}, with {pre_generated_data.num_rows} rows"
-                )
-                seed_data = self._get_missing_data(dataset, pre_generated_data)
-                if seed_data.num_rows == 0:
-                    logger.info(
-                        f"All seed data has been generated, no missing rows found, returning data from {checkpoint_dir}"
-                    )
-                    return pre_generated_data
-                logger.info(f"Found {seed_data.num_rows} missing rows in the dataset")
-            except EmptyDatasetError:
-                logger.info(
-                    f"No existing checkpoints found in {checkpoint_dir}, generating from scratch"
-                )
-                seed_data = dataset
-        else:
-            seed_data = dataset
+    def generate(
+        self, dataset: Dataset, checkpoint_dir: Optional[str] = None
+    ) -> Dataset:
+        """Generate synthetic data using the configured flows.
+        Parameters
+        ----------
+        dataset : Dataset
+            The input dataset to process.
+        checkpoint_dir : Optional[str], optional
+            Directory to save checkpoints, by default None
+        Returns
+        -------
+        Dataset
+            The generated dataset.
+        Notes
+        -----
+        If checkpoint_dir is provided, the generation process can be resumed
+        from the last checkpoint in case of interruption.
+        """
+        # Initialize checkpointer
+        checkpointer = Checkpointer(checkpoint_dir, self.save_freq)
+        # Load existing checkpoints and determine missing data
+        seed_data, pre_generated_data = checkpointer.load_existing_data(dataset)
+        # If all data has been generated, return the pre-generated data
+        if seed_data.num_rows == 0 and pre_generated_data is not None:
+            return pre_generated_data
         if not self.batch_size:
             # If batch size is not provided, generate the dataset in a single pass
             generated_dataset = seed_data
-            # generated_data is initialized with seed_data, and it gets updated with each pipeline
-            for pipeline in self.pipelines:
-                generated_dataset = pipeline.generate(seed_data)
+            # generated_data is initialized with seed_data, and it gets updated with each flow
+            for flow in self.flows:
+                generated_dataset = flow.generate(generated_dataset)
             return generated_dataset
         logger.info("Splitting the dataset into smaller batches")
-        input_splits = (
-            self._split_dataset(seed_data, self.batch_size)
-            if self.batch_size
-            else [seed_data]
-        )
+        input_splits = self._split_dataset(seed_data, self.batch_size)
         logger.info(
-            f"Generating dataset with {len(input_splits)} splits, batch size {self.batch_size}, and {self.num_workers} workers"
+            f"Generating dataset with {len(input_splits)} splits, "
+            f"batch size {self.batch_size}, and {self.num_workers} workers"
         )
         generated_data = [pre_generated_data] if pre_generated_data else []
@@ -136,7 +177,7 @@ class SDG:
         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             futures = [
                 executor.submit(
-                    self._generate_data, self.pipelines, input_split, seed_data, i
+                    self._generate_data, self.flows, input_split, seed_data, i
                 )
                 for i, input_split in enumerate(input_splits)
             ]
@@ -147,16 +188,17 @@ class SDG:
                 if generated_data_split:
                     generated_data.append(generated_data_split)
                     logger.info(f"Finished future processing split {i} \n\n")
-                    if self.save_freq and (i + 1) % self.save_freq == 0:
+                    # Use checkpointer to handle intermediate saves
+                    if checkpointer.should_save_checkpoint(i):
                         # Save only the new splits since the last checkpoint
                         new_splits = generated_data[last_saved_split_index : i + 1]
                         checkpoint_dataset = safe_concatenate_datasets(new_splits)
                         # check if checkpoint_dataset is not None
                         if checkpoint_dataset:
-                            self._save_intermediate_checkpoint(
-                                checkpoint_dataset, checkpoint_dir
+                            checkpointer.save_intermediate_checkpoint(
+                                checkpoint_dataset
                             )
                             last_saved_split_index = i + 1
         generated_dataset = safe_concatenate_datasets(generated_data)

sdg_hub/utils/config_validation.py ADDED Viewed

@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Configuration validation utilities for SDG Hub.
+This module provides functions to validate configuration files used by blocks,
+ensuring they meet the required schema and contain all necessary fields.
+"""
+# Standard
+from typing import Any, Dict, List
+# Local
+from ..logger_config import setup_logger
+logger = setup_logger(__name__)
+def validate_prompt_config_schema(
+    config: Dict[str, Any], config_path: str
+) -> tuple[bool, List[str]]:
+    """Validate that a prompt configuration file has the required schema fields.
+    For prompt template configs, 'system' and 'generation' are required fields.
+    Other fields like 'introduction', 'principles', 'examples', 'start_tags', 'end_tags' are optional.
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        The loaded configuration dictionary.
+    config_path : str
+        The path to the configuration file (for error reporting).
+    Returns
+    -------
+    tuple[bool, List[str]]
+        A tuple containing:
+        - bool: True if schema is valid, False otherwise
+        - List[str]: List of validation error messages (empty if valid)
+    """
+    required_fields = ["system", "generation"]
+    errors = []
+    # Ensure config is a dictionary
+    if not isinstance(config, dict):
+        errors.append(f"Configuration must be a dictionary, got {type(config).__name__}")
+        return False, errors
+    # Check for missing required fields
+    missing_fields = [field for field in required_fields if field not in config]
+    if missing_fields:
+        errors.append(f"Missing required fields: {missing_fields}")
+    # Check for empty or null required fields and validate they are strings
+    for field in required_fields:
+        if field in config:
+            value = config[field]
+            if value is None:
+                errors.append(f"Required field '{field}' is null")
+            elif not isinstance(value, str):
+                errors.append(f"Required field '{field}' must be a string, got {type(value).__name__}")
+            elif not value.strip():
+                errors.append(f"Required field '{field}' is empty")
+    # Check optional string fields are strings when present
+    string_fields = ["introduction", "principles", "examples"]
+    for field in string_fields:
+        if field in config:
+            value = config[field]
+            if value is not None and not isinstance(value, str):
+                errors.append(f"Field '{field}' must be a string, got {type(value).__name__}")
+    # Check start_tags and end_tags are lists of strings when present
+    tag_fields = ["start_tags", "end_tags"]
+    for field in tag_fields:
+        if field in config:
+            value = config[field]
+            if value is not None:
+                if not isinstance(value, list):
+                    errors.append(f"Field '{field}' must be a list, got {type(value).__name__}")
+                else:
+                    for i, tag in enumerate(value):
+                        if not isinstance(tag, str):
+                            errors.append(f"Field '{field}[{i}]' must be a string, got {type(tag).__name__}")
+    # Log validation results
+    if errors:
+        for error in errors:
+            logger.error(f"Config validation failed for {config_path}: {error}")
+        return False, errors
+    logger.debug(f"Config validation passed for {config_path}")
+    return True, []

sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl

sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl