sdg-hub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +3 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +36 -0
- sdg_hub/blocks/block.py +96 -0
- sdg_hub/blocks/llmblock.py +375 -0
- sdg_hub/blocks/utilblocks.py +597 -0
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +28 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +34 -0
- sdg_hub/configs/skills/freeform_responses.yaml +39 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
- sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
- sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
- sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
- sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +59 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +306 -0
- sdg_hub/flow_runner.py +204 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +121 -0
- sdg_hub/prompts.py +43 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +206 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub-0.1.0.dist-info/METADATA +190 -0
- sdg_hub-0.1.0.dist-info/RECORD +82 -0
- sdg_hub-0.1.0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/flow.py
ADDED
@@ -0,0 +1,306 @@
"""
Flow module for managing data generation pipelines.

This module provides the core Flow class that handles both configuration loading and execution
of data generation blocks. The Flow class serves as the main interface for defining and running
data generation pipelines, supporting both direct usage with SDG and backward compatibility
through the deprecated Pipeline class.

Example:
    >>> flow = Flow(llm_client)
    >>> flow = flow.get_flow_from_file("path/to/flow.yaml")
    >>> dataset = flow.generate(input_dataset)

Note:
    This module is part of the SDG Hub package and is designed to work in conjunction
    with the SDG class for distributed data generation.
"""

# SPDX-License-Identifier: Apache-2.0
# Standard
from abc import ABC
from importlib import resources
from typing import Optional, List, Dict, Any, Callable
import operator
import os

# Third Party
import yaml
from datasets import Dataset
from datasets.data_files import EmptyDatasetError

# Local
from .blocks import *  # needed to register blocks
from .prompts import *  # needed to register prompts
from .registry import BlockRegistry, PromptRegistry
from .logger_config import setup_logger


logger = setup_logger(__name__)

OPERATOR_MAP: Dict[str, Callable] = {
    "operator.eq": operator.eq,
    "operator.ge": operator.ge,
    "operator.le": operator.le,
    "operator.gt": operator.gt,
    "operator.lt": operator.lt,
    "operator.ne": operator.ne,
    "operator.contains": operator.contains,
}

CONVERT_DTYPE_MAP: Dict[str, Callable] = {
    "float": float,
    "int": int,
}


class Flow(ABC):
    """A class representing a data generation flow.

    This class handles both configuration loading and execution of data generation
    blocks. It can be used directly with SDG or through the deprecated Pipeline class.
    """

    def __init__(
        self,
        llm_client: Any,
        num_samples_to_generate: Optional[int] = None,
    ) -> None:
        """
        Initialize the Flow class.

        Parameters
        ----------
        llm_client : Any
            The LLM client to use for generation.
        num_samples_to_generate : Optional[int], optional
            Number of samples to generate, by default None

        Attributes
        ----------
        llm_client : Any
            The LLM client instance.
        base_path : str
            Base path for resource files.
        registered_blocks : Dict[str, Any]
            Registry of available blocks.
        chained_blocks : Optional[List[Dict[str, Any]]]
            List of block configurations.
        num_samples_to_generate : Optional[int]
            Number of samples to generate.
        """
        self.llm_client = llm_client
        self.base_path = str(resources.files(__package__))
        self.registered_blocks = BlockRegistry.get_registry()
        self.chained_blocks = None  # Will be set by get_flow_from_file
        self.num_samples_to_generate = num_samples_to_generate

    def _getFilePath(self, dirs: List[str], filename: str) -> str:
        """Find a named configuration file.

        Files are checked in the following order:
        1. Absolute path is always used
        2. Checked relative to the directories in "dirs"
        3. Relative to the current directory

        Parameters
        ----------
        dirs : List[str]
            Directories in which to search for the file.
        filename : str
            The path to the configuration file.

        Returns
        -------
        str
            Selected file path.
        """
        if os.path.isabs(filename):
            return filename
        for d in dirs:
            full_file_path = os.path.join(d, filename)
            if os.path.isfile(full_file_path):
                return full_file_path
        # If not found above then return the path unchanged, i.e.
        # assume the path is relative to the current directory
        return filename

    def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
        """Drop duplicates from the dataset based on the columns provided.

        Parameters
        ----------
        dataset : Dataset
            The input dataset.
        cols : List[str]
            Columns to consider for duplicate detection.

        Returns
        -------
        Dataset
            Dataset with duplicates removed.
        """
        df = dataset.to_pandas()
        df = df.drop_duplicates(subset=cols).reset_index(drop=True)
        return Dataset.from_pandas(df)

    def generate(self, dataset: Dataset) -> Dataset:
        """Generate the dataset by running the pipeline steps.

        Parameters
        ----------
        dataset : Dataset
            The input dataset to process.

        Returns
        -------
        Dataset
            The processed dataset.

        Raises
        ------
        ValueError
            If Flow has not been initialized with blocks.
        EmptyDatasetError
            If a block produces an empty dataset.
        """
        if self.chained_blocks is None:
            raise ValueError(
                "Flow has not been initialized with blocks. "
                "Call get_flow_from_file() first. "
                "Or pass a list of blocks to the Flow constructor."
            )

        for block_prop in self.chained_blocks:
            block_type = block_prop["block_type"]
            block_config = block_prop["block_config"]
            drop_columns = block_prop.get("drop_columns", [])
            gen_kwargs = block_prop.get("gen_kwargs", {})
            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
            block = block_type(**block_config)

            logger.debug("------------------------------------\n")
            logger.debug("Running block: %s", block_config["block_name"])
            logger.debug("Input dataset: %s", dataset)

            dataset = block.generate(dataset, **gen_kwargs)

            if len(dataset) == 0:
                raise EmptyDatasetError(
                    f"Pipeline stopped: "
                    f"Empty dataset after running block: "
                    f"{block_config['block_name']}"
                )

            drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
            if drop_columns:
                dataset = dataset.remove_columns(drop_columns_in_ds)

            if drop_duplicates_cols:
                dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)

            logger.debug("Output dataset: %s", dataset)
            logger.debug("------------------------------------\n\n")

        return dataset

    def get_flow_from_file(self, yaml_path: str) -> "Flow":
        """Load and initialize flow configuration from a YAML file.

        Parameters
        ----------
        yaml_path : str
            Path to the YAML configuration file.

        Returns
        -------
        Flow
            Self with initialized chained_blocks.

        Raises
        ------
        FileNotFoundError
            If the YAML file cannot be found.
        KeyError
            If a required block or prompt is not found in the registry.
        """
        yaml_path_relative_to_base = os.path.join(self.base_path, yaml_path)
        if os.path.isfile(yaml_path_relative_to_base):
            yaml_path = yaml_path_relative_to_base
        yaml_dir = os.path.dirname(yaml_path)

        try:
            with open(yaml_path, "r", encoding="utf-8") as yaml_file:
                flow = yaml.safe_load(yaml_file)
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"File not found: {yaml_path}") from exc

        # update config with class instances
        for block in flow:
            # check if there's an LLM block in the flow
            if "LLM" in block["block_type"]:
                block["block_config"]["client"] = self.llm_client
                # model_id and prompt templates
                # try to get a template using the model_id, but if model_prompt_template is provided, use that
                if block["block_config"].get("model_prompt", None) is None:
                    # try to find a match in the registry
                    matched_prompt = next(
                        (
                            key
                            for key in PromptRegistry.get_registry()
                            if key in block["block_config"]["model_id"]
                        ),
                        None,
                    )
                    if matched_prompt is not None:
                        block["block_config"]["model_prompt"] = matched_prompt
                    else:
                        raise KeyError(
                            f"Prompt not found in registry: {block['block_config']['model_id']}"
                        )

                if self.num_samples_to_generate is not None:
                    block["num_samples"] = self.num_samples_to_generate

            # update block type to llm class instance
            try:
                block["block_type"] = self.registered_blocks[block["block_type"]]
            except KeyError as exc:
                raise KeyError(
                    f"Block not found in registry: {block['block_type']}"
                ) from exc

            # update config path to absolute path
            if "config_path" in block["block_config"]:
                block["block_config"]["config_path"] = self._getFilePath(
                    [yaml_dir, self.base_path], block["block_config"]["config_path"]
                )

            # update config paths to absolute paths - this might be a list or a dict
            if "config_paths" in block["block_config"]:
                if isinstance(block["block_config"]["config_paths"], dict):
                    for key, path in block["block_config"]["config_paths"].items():
                        block["block_config"]["config_paths"][key] = self._getFilePath(
                            [yaml_dir, self.base_path], path
                        )
                elif isinstance(block["block_config"]["config_paths"], list):
                    for i, path in enumerate(block["block_config"]["config_paths"]):
                        block["block_config"]["config_paths"][i] = self._getFilePath(
                            [yaml_dir, self.base_path], path
                        )

            if "operation" in block["block_config"]:
                block["block_config"]["operation"] = OPERATOR_MAP[
                    block["block_config"]["operation"]
                ]

            if "convert_dtype" in block["block_config"]:
                block["block_config"]["convert_dtype"] = CONVERT_DTYPE_MAP[
                    block["block_config"]["convert_dtype"]
                ]

        # Store the chained blocks and return self
        self.chained_blocks = flow
        return self
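
Taken together, get_flow_from_file() and generate() form the end-to-end loop: load a YAML block list, resolve the client, prompt template, config paths, and operator strings, then run each block over the dataset in sequence. A minimal usage sketch, assuming an OpenAI-compatible server is running; the endpoint URL and the seed columns below are placeholders, not values shipped with the package:

    from datasets import Dataset
    from openai import OpenAI

    from sdg_hub.flow import Flow

    # Placeholder endpoint; any OpenAI-compatible server works.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    # Paths are checked against the installed package's base_path first,
    # so shipped flows can be referenced by their package-relative path.
    flow = Flow(client).get_flow_from_file(
        "flows/generation/knowledge/simple_knowledge.yaml"
    )

    # Hypothetical seed row; the columns a flow expects depend on its prompt configs.
    seed = Dataset.from_list([{"document": "Some source text.", "domain": "history"}])
    generated = flow.generate(seed)
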
sdg_hub/flow_runner.py
ADDED
@@ -0,0 +1,204 @@
"""Script for running data generation flows with configurable parameters."""

# Standard
import os

# Third Party
from datasets import load_dataset
from openai import OpenAI
import click

# First Party
from sdg_hub.flow import Flow
from sdg_hub.logger_config import setup_logger
from sdg_hub.sdg import SDG


logger = setup_logger(__name__)


def run_flow(
    ds_path: str,
    save_path: str,
    endpoint: str,
    flow_path: str,
    checkpoint_dir: str,
    batch_size: int = 8,
    num_workers: int = 32,
    save_freq: int = 2,
    debug: bool = False,
) -> None:
    """Process the dataset using the specified configuration.

    Parameters
    ----------
    ds_path : str
        Path to the dataset file.
    save_path : str
        Path where the output will be saved.
    endpoint : str
        API endpoint for data processing.
    flow_path : str
        Path to the flow configuration file.
    checkpoint_dir : str
        Directory path for saving checkpoints.
    batch_size : int, optional
        Batch size for processing, by default 8.
    num_workers : int, optional
        Number of worker processes to use, by default 32.
    save_freq : int, optional
        Frequency (in batches) at which to save checkpoints, by default 2.
    debug : bool, optional
        If True, enables debug mode with a smaller dataset subset, by default False.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the flow configuration file is not found.
    """
    logger.info(f"Generation configuration: {locals()}\n\n")
    ds = load_dataset("json", data_files=ds_path, split="train")

    if debug:
        ds = ds.shuffle(seed=42).select(range(30))
        logger.info("Debug mode enabled. Using a subset of the dataset.")

    openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY")
    openai_api_base = endpoint

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    if not os.path.exists(flow_path):
        raise FileNotFoundError(f"Flow file not found: {flow_path}")

    flow = Flow(client).get_flow_from_file(flow_path)
    sdg = SDG(
        flows=[flow],
        num_workers=num_workers,
        batch_size=batch_size,
        save_freq=save_freq,
    )

    generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
    generated_data.to_json(save_path, orient="records", lines=True)
    logger.info(f"Data saved to {save_path}")


@click.command()
@click.option(
    "--ds_path",
    type=click.Path(exists=True),
    required=True,
    help="Path to the dataset.",
)
@click.option(
    "--bs",
    type=int,
    default=8,
    show_default=True,
    help="Batch size for processing.",
)
@click.option(
    "--num_workers",
    type=int,
    default=32,
    show_default=True,
    help="Number of worker processes to use.",
)
@click.option(
    "--save_path",
    type=click.Path(),
    required=True,
    help="Path to save the output.",
)
@click.option(
    "--endpoint",
    type=str,
    required=True,
    help="API endpoint for data processing.",
)
@click.option(
    "--flow",
    type=click.Path(exists=True),
    required=True,
    help="Flow configuration for the process.",
)
@click.option(
    "--checkpoint_dir",
    type=click.Path(),
    required=True,
    help="Path to save checkpoints.",
)
@click.option(
    "--save_freq",
    type=int,
    default=2,
    show_default=True,
    help="Frequency to save checkpoints.",
)
@click.option(
    "--debug",
    is_flag=True,
    help="Enable debug mode with a smaller dataset subset.",
)
def main(
    ds_path: str,
    bs: int,
    num_workers: int,
    save_path: str,
    endpoint: str,
    flow: str,
    checkpoint_dir: str,
    save_freq: int,
    debug: bool,
) -> None:
    """CLI entry point for running data generation flows.

    Parameters
    ----------
    ds_path : str
        Path to the dataset file.
    bs : int
        Batch size for processing.
    num_workers : int
        Number of worker processes to use.
    save_path : str
        Path where the output will be saved.
    endpoint : str
        API endpoint for data processing.
    flow : str
        Path to the flow configuration file.
    checkpoint_dir : str
        Directory path for saving checkpoints.
    save_freq : int
        Frequency (in batches) at which to save checkpoints.
    debug : bool
        If True, enables debug mode with a smaller dataset subset.

    Returns
    -------
    None
    """
    run_flow(
        ds_path=ds_path,
        batch_size=bs,
        num_workers=num_workers,
        save_path=save_path,
        endpoint=endpoint,
        flow_path=flow,
        checkpoint_dir=checkpoint_dir,
        save_freq=save_freq,
        debug=debug,
    )


if __name__ == "__main__":
    # pylint: disable=no-value-for-parameter
    main()
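
Since main() simply forwards its options to run_flow(), the runner can also be driven programmatically. A sketch using the signature above; every path and the endpoint are placeholders, not values shipped with the package:

    from sdg_hub.flow_runner import run_flow

    run_flow(
        ds_path="seed_data.jsonl",            # JSON-lines seed dataset (placeholder path)
        save_path="generated.jsonl",          # written via to_json(..., lines=True)
        endpoint="http://localhost:8000/v1",  # OpenAI-compatible base URL (placeholder)
        flow_path="flows/generation/knowledge/simple_knowledge.yaml",
        checkpoint_dir="checkpoints/",
        batch_size=8,
        num_workers=32,
        save_freq=2,
        debug=True,  # shuffles and keeps 30 rows, as in the code above
    )
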
sdg_hub/flows/generation/knowledge/mmlu_bench.yaml
ADDED
@@ -0,0 +1,13 @@
- block_type: LLMBlock
  block_config:
    block_name: gen_mmlu_knowledge
    config_path: configs/knowledge/mcq_generation.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - mmlubench_question
      - mmlubench_answer
  gen_kwargs:
    temperature: 0
    max_tokens: 2048
  drop_duplicates:
    - mmlubench_question
sdg_hub/flows/generation/knowledge/simple_knowledge.yaml
ADDED
@@ -0,0 +1,12 @@
- block_type: LLMBlock
  block_config:
    block_name: gen_knowledge
    config_path: configs/knowledge/simple_generate_qa.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - output
  gen_kwargs:
    temperature: 0.7
    max_tokens: 2048
  drop_duplicates:
    - output
sdg_hub/flows/generation/knowledge/synth_knowledge.yaml
ADDED
@@ -0,0 +1,89 @@
- block_type: LLMBlock
  block_config:
    block_name: gen_knowledge
    config_path: configs/knowledge/generate_questions_responses.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - question
      - response
    parser_kwargs:
      parser_name: custom
      parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
      parser_cleanup_tags:
        - "[END]"
  gen_kwargs:
    max_tokens: 2048
  drop_duplicates:
    - question

- block_type: LLMBlock
  block_config:
    block_name: eval_faithfulness_qa_pair
    config_path: configs/knowledge/evaluate_faithfulness.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - explanation
      - judgment
  gen_kwargs:
    max_tokens: 2048

- block_type: FilterByValueBlock
  block_config:
    block_name: filter_faithfulness
    filter_column: judgment
    filter_value: "YES"
    operation: operator.eq
    batch_kwargs:
      num_procs: 8
  drop_columns:
    - judgment
    - explanation

- block_type: LLMBlock
  block_config:
    block_name: eval_relevancy_qa_pair
    config_path: configs/knowledge/evaluate_relevancy.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - feedback
      - score
  gen_kwargs:
    max_tokens: 2048

- block_type: FilterByValueBlock
  block_config:
    block_name: filter_relevancy
    filter_column: score
    filter_value: 2.0
    operation: operator.eq
    convert_dtype: float
    batch_kwargs:
      num_procs: 8
  drop_columns:
    - feedback
    - score

- block_type: LLMBlock
  block_config:
    block_name: eval_verify_question
    config_path: configs/knowledge/evaluate_question.yaml
    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
    output_cols:
      - explanation
      - rating
  gen_kwargs:
    max_tokens: 2048

- block_type: FilterByValueBlock
  block_config:
    block_name: filter_verify_question
    filter_column: rating
    filter_value: 1.0
    operation: operator.eq
    convert_dtype: float
    batch_kwargs:
      num_procs: 8
  drop_columns:
    - explanation
    - rating
    - __index_level_0__
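
Note how the operation and convert_dtype fields in the filter blocks above stay plain YAML strings; get_flow_from_file() swaps them for callables via OPERATOR_MAP and CONVERT_DTYPE_MAP before the block is constructed. A small sketch restating that lookup; the block_config dict is illustrative, mirroring filter_verify_question:

    import operator

    # Excerpts of the maps defined in flow.py above.
    OPERATOR_MAP = {"operator.eq": operator.eq}
    CONVERT_DTYPE_MAP = {"float": float, "int": int}

    block_config = {
        "filter_column": "rating",
        "filter_value": 1.0,
        "operation": "operator.eq",
        "convert_dtype": "float",
    }
    block_config["operation"] = OPERATOR_MAP[block_config["operation"]]
    block_config["convert_dtype"] = CONVERT_DTYPE_MAP[block_config["convert_dtype"]]

    # A rating that comes back from the model as the string "1.0" now passes the filter:
    assert block_config["operation"](block_config["convert_dtype"]("1.0"), 1.0)
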