sdg_hub-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- sdg_hub/__init__.py +3 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +36 -0
- sdg_hub/blocks/block.py +96 -0
- sdg_hub/blocks/llmblock.py +375 -0
- sdg_hub/blocks/utilblocks.py +597 -0
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +28 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +34 -0
- sdg_hub/configs/skills/freeform_responses.yaml +39 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
- sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
- sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
- sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
- sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +59 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +306 -0
- sdg_hub/flow_runner.py +204 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +121 -0
- sdg_hub/prompts.py +43 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +206 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub-0.1.0.dist-info/METADATA +190 -0
- sdg_hub-0.1.0.dist-info/RECORD +82 -0
- sdg_hub-0.1.0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/__init__.py
ADDED
sdg_hub/_version.py
ADDED
@@ -0,0 +1,21 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '0.1.0'
__version_tuple__ = version_tuple = (0, 1, 0)
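The generated module exposes the release version under several aliases. A quick sanity check (a sketch; assumes the wheel is installed):

```python
# Hypothetical usage: confirm the aliases defined in _version.py agree.
import sdg_hub._version as v

assert v.__version__ == v.version == "0.1.0"
assert v.__version_tuple__ == v.version_tuple == (0, 1, 0)
```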
sdg_hub/blocks/__init__.py
ADDED
@@ -0,0 +1,36 @@
"""Block implementations for SDG Hub.

This package provides various block implementations for data generation, processing, and transformation.
"""

# Local
from .block import Block
from .llmblock import LLMBlock, ConditionalLLMBlock
from .utilblocks import (
    SamplePopulatorBlock,
    SelectorBlock,
    CombineColumnsBlock,
    FlattenColumnsBlock,
    DuplicateColumns,
    RenameColumns,
    SetToMajorityValue,
    FilterByValueBlock,
    IterBlock,
)
from ..registry import BlockRegistry

__all__ = [
    "Block",
    "FilterByValueBlock",
    "IterBlock",
    "LLMBlock",
    "ConditionalLLMBlock",
    "SamplePopulatorBlock",
    "SelectorBlock",
    "CombineColumnsBlock",
    "FlattenColumnsBlock",
    "DuplicateColumns",
    "RenameColumns",
    "SetToMajorityValue",
    "BlockRegistry",
]
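Everything listed in `__all__` above is re-exported, so downstream code can import blocks from `sdg_hub.blocks` directly (a sketch; assumes the wheel is installed):

```python
# Hypothetical usage of the public import surface; names come from __all__.
from sdg_hub.blocks import (
    Block,
    LLMBlock,
    ConditionalLLMBlock,
    FilterByValueBlock,
    BlockRegistry,
)
```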
sdg_hub/blocks/block.py
ADDED
@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
"""Base block implementation for the SDG Hub system.

This module provides the abstract base class for all blocks in the system,
including functionality for template validation and configuration management.
"""

# Standard
from abc import ABC
from collections import ChainMap
from typing import Any, Dict, Optional

# Third Party
from jinja2 import Template, UndefinedError
import yaml

# Local
from ..registry import BlockRegistry
from ..logger_config import setup_logger

logger = setup_logger(__name__)


@BlockRegistry.register("Block")
class Block(ABC):
    """Base abstract class for all blocks in the system.

    This class provides common functionality for block validation and configuration loading.
    All specific block implementations should inherit from this class.
    """

    def __init__(self, block_name: str) -> None:
        self.block_name = block_name

    @staticmethod
    def _validate(prompt_template: Template, input_dict: Dict[str, Any]) -> bool:
        """Validate the input data for this block.

        This method validates whether all required variables in the Jinja template are provided in the input_dict.

        Parameters
        ----------
        prompt_template : Template
            The Jinja2 template object.
        input_dict : Dict[str, Any]
            A dictionary of input values to check against the template.

        Returns
        -------
        bool
            True if the input data is valid (i.e., no missing variables), False otherwise.
        """

        class Default(dict):
            def __missing__(self, key: str) -> None:
                raise KeyError(key)

        try:
            # Try rendering the template with the input_dict
            prompt_template.render(ChainMap(input_dict, Default()))
            return True
        except UndefinedError as e:
            logger.error(f"Missing key: {e}")
            return False

    def _load_config(self, config_path: str) -> Optional[Dict[str, Any]]:
        """Load the configuration file for this block.

        Parameters
        ----------
        config_path : str
            The path to the configuration file.

        Returns
        -------
        Optional[Dict[str, Any]]
            The loaded configuration. Returns None if file cannot be read or parsed.

        Raises
        ------
        FileNotFoundError
            If the configuration file does not exist.
        """
        try:
            with open(config_path, "r", encoding="utf-8") as config_file:
                try:
                    return yaml.safe_load(config_file)
                except yaml.YAMLError as e:
                    logger.error(f"Error parsing YAML from {config_path}: {e}")
                    return None
        except FileNotFoundError:
            logger.error(f"Configuration file not found: {config_path}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error reading config file {config_path}: {e}")
            return None
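A minimal sketch of how a concrete block would use the two helpers above; `EchoBlock`, its config path, and the `prompt` config key are illustrative assumptions, not part of the package:

```python
from jinja2 import Template

from sdg_hub.blocks import Block


class EchoBlock(Block):
    """Toy block: render a Jinja template loaded from a YAML config."""

    def __init__(self, block_name: str, config_path: str) -> None:
        super().__init__(block_name)
        # _load_config parses the YAML file (raises FileNotFoundError if the
        # file is absent, returns None on YAML errors); "prompt" is an
        # assumed config key for this sketch.
        self.config = self._load_config(config_path)
        self.template = Template(self.config["prompt"])

    def run(self, sample: dict) -> str:
        # _validate is documented to return False (with a logged error) when
        # the template needs variables the sample does not provide.
        if not self._validate(self.template, sample):
            raise ValueError(f"invalid sample for block {self.block_name}")
        return self.template.render(**sample)
```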
sdg_hub/blocks/llmblock.py
ADDED
@@ -0,0 +1,375 @@
# SPDX-License-Identifier: Apache-2.0
"""LLM-based blocks for text generation and processing.

This module provides blocks for interacting with language models.
"""

# Standard
from typing import Any, Dict, List, Optional, Union
import json
import re

# Third Party
from datasets import Dataset
from jinja2 import Template
import openai

# Local
from .block import Block
from ..logger_config import setup_logger
from ..registry import BlockRegistry, PromptRegistry

logger = setup_logger(__name__)


def server_supports_batched(client: openai.OpenAI, model_id: str) -> bool:
    """Check if the server supports batched inputs.

    This function checks if the server supports batched inputs by making a test call to the server.

    Parameters
    ----------
    client : openai.OpenAI
        The client to use to make the test call.
    model_id : str
        The model ID to use for the test call.
    """
    supported = getattr(client, "server_supports_batched", None)
    if supported is not None:
        return supported
    try:
        # Make a test call to the server to determine whether it supports
        # multiple input prompts per request and also the n parameter
        response = client.completions.create(
            model=model_id, prompt=["test1", "test2"], max_tokens=1, n=3
        )
        # Number of outputs should be 2 * 3 = 6
        supported = len(response.choices) == 6
    except openai.InternalServerError:
        supported = False
    client.server_supports_batched = supported
    logger.info(f"LLM server supports batched inputs: {client.server_supports_batched}")
    return supported


@BlockRegistry.register("LLMBlock")
class LLMBlock(Block):
    """Block for generating text using language models.

    This block handles text generation, prompt formatting, and output parsing
    for language model interactions.

    Parameters
    ----------
    block_name : str
        Name of the block.
    config_path : str
        Path to the configuration file.
    client : openai.OpenAI
        OpenAI client instance.
    output_cols : List[str]
        List of output column names.
    parser_kwargs : Dict[str, Any], optional
        Keyword arguments for the parser, by default {}.
    model_prompt : str, optional
        Template string for model prompt, by default "{prompt}".
    model_id : Optional[str], optional
        Model ID to use, by default None.
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing.
    """

    # pylint: disable=too-many-instance-attributes
    def __init__(
        self,
        block_name: str,
        config_path: str,
        client: openai.OpenAI,
        output_cols: List[str],
        parser_kwargs: Dict[str, Any] = {},
        model_prompt: str = "{prompt}",
        model_id: Optional[str] = None,
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(block_name)
        self.block_config = self._load_config(config_path)
        self.prompt_struct = (
            """{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
        )
        filtered_config = {
            k: (v if v is not None else "") for k, v in self.block_config.items()
        }
        self.prompt_template = Template(self.prompt_struct.format(**filtered_config))
        self.client = client
        if model_id:
            self.model = model_id
        else:
            # get the default model id from client
            self.model = self.client.models.list().data[0].id

        self.model_prompt = model_prompt
        self.output_cols = output_cols
        self.batch_params = batch_kwargs.get("batch_kwargs", {})
        self.parser_name = parser_kwargs.get("parser_name", None)
        self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
        self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
        self.defaults = {
            "model": self.model,
            "temperature": 0,
            "max_tokens": 4096,
        }

        # Whether the LLM server supports a list of input prompts
        # and supports the n parameter to generate n outputs per input
        self.server_supports_batched = server_supports_batched(client, self.model)

    def _extract_matches(
        self, text: str, start_tag: Optional[str], end_tag: Optional[str]
    ) -> List[str]:
        if not text:
            return []
        if not start_tag and not end_tag:
            return [text.strip()]

        pattern = ""
        if start_tag:
            pattern += re.escape(start_tag)
        pattern += r"(.*?)"
        if end_tag:
            pattern += re.escape(end_tag)
        elif start_tag:
            # Enforce matching till end of string when only start_tag is provided.
            pattern += "$"

        return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]

    def _parse(self, generated_string: str) -> dict:
        matches = {}

        if self.parser_name is not None and self.parser_name == "custom":
            pattern = re.compile(self.parsing_pattern, re.DOTALL)
            all_matches = pattern.findall(generated_string)
            matches = {column_name: [] for column_name in self.output_cols}
            if all_matches and isinstance(all_matches[0], tuple):
                for match in all_matches:
                    for column_name, value in zip(self.output_cols, match):
                        value = value.strip()
                        for clean_tag in self.parser_cleanup_tags:
                            value = value.replace(clean_tag, "")
                        matches[column_name].append(value)
            else:
                matches[self.output_cols[0]] = (
                    [match.strip() for match in all_matches] if all_matches else []
                )
        else:
            for start_tag, end_tag, output_col in zip(
                self.block_config.get("start_tags", []),
                self.block_config.get("end_tags", []),
                self.output_cols,
            ):
                matches[output_col] = self._extract_matches(
                    generated_string, start_tag, end_tag
                )

        return matches

    def _format_prompt(self, sample: Dict) -> str:
        prompt_templated_str = self.prompt_template.render(sample).strip()
        return PromptRegistry.render_template(
            self.model_prompt, prompt_templated_str, add_generation_prompt=True
        ).strip()

    def _generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> list:
        prompts = [self._format_prompt(sample) for sample in samples]
        logger.debug("Prompt: %s", prompts[0])
        generate_args = {**self.defaults, **gen_kwargs}

        if self.server_supports_batched:
            response = self.client.completions.create(prompt=prompts, **generate_args)
            # If stop is provided, append the stop token to the generated text.
            # The OpenAI API does not include the stop token in the generated
            # text, so we re-add it to keep the output consistent for the parser.
            if "stop" in generate_args:
                return [
                    choice.text.strip() + "".join(generate_args["stop"])
                    for choice in response.choices
                ]
            return [choice.text.strip() for choice in response.choices]

        n = gen_kwargs.get("n", 1)
        results = []
        for prompt in prompts:
            for _ in range(n):
                response = self.client.completions.create(
                    prompt=prompt, **generate_args
                )
                if "stop" in generate_args:
                    results.append(
                        response.choices[0].text.strip()
                        + "".join(generate_args["stop"])
                    )
                else:
                    results.append(response.choices[0].text.strip())
        return results

    def generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> Dataset:
        """Generate the output from the block.

        This method first validates the input data, then generates the output,
        and finally parses the generated output before returning it.

        Returns
        -------
        Dataset
            The parsed output after generation.
        """
        num_samples = self.block_config.get("num_samples", None)
        logger.debug("Generating outputs for %s samples", len(samples))

        if (num_samples is not None) and ("num_samples" not in samples.column_names):
            samples = samples.add_column("num_samples", [num_samples] * len(samples))

        # Validate each sample; log errors and remove invalid samples.
        valid_samples = []

        for sample in samples:
            if self._validate(self.prompt_template, sample):
                valid_samples.append(sample)
            else:
                # Log details of the failed sample
                logger.warning(f"Sample failed validation: {sample}")

        samples = valid_samples

        if len(samples) == 0:
            logger.warning(
                "No valid samples to generate outputs for, returning empty dataset"
            )
            return Dataset.from_list([])

        # generate the output
        outputs = self._generate(samples, **gen_kwargs)

        logger.debug("Generated outputs: %s", outputs)

        num_parallel_samples = gen_kwargs.get("n", 1)
        extended_samples = []

        # Duplicate each input sample n times, where n is the number
        # of output sequences generated per input, so that we can
        # pair up the inputs and outputs.
        for item in samples:
            extended_samples.extend([item] * num_parallel_samples)

        new_data = []
        for sample, output in zip(extended_samples, outputs):
            parsed_outputs = self._parse(output)
            max_length = max(len(value) for value in parsed_outputs.values())
            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
                new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))})

        return Dataset.from_list(new_data)


@BlockRegistry.register("ConditionalLLMBlock")
class ConditionalLLMBlock(LLMBlock):
    """Block for conditional text generation using language models.

    This block selects different prompt templates based on a selector column value.

    Parameters
    ----------
    block_name : str
        Name of the block.
    config_paths : Dict[str, str]
        Dictionary mapping selector values to their config file paths.
    client : openai.OpenAI
        OpenAI client instance.
    model_id : str
        Model ID to use.
    output_cols : List[str]
        List of output column names.
    selector_column_name : str
        Name of the column used to select the prompt template.
    model_prompt : str, optional
        Template string for model prompt, by default "{prompt}".
    **batch_kwargs : Dict[str, Any]
        Additional keyword arguments for batch processing.
    """

    def __init__(
        self,
        block_name: str,
        config_paths: Dict[str, str],
        client: openai.OpenAI,
        model_id: str,
        output_cols: List[str],
        selector_column_name: str,
        model_prompt: str = "{prompt}",
        **batch_kwargs: Dict[str, Any],
    ) -> None:
        super().__init__(
            block_name=block_name,
            config_path=list(config_paths.values())[0],
            client=client,
            model_id=model_id,
            output_cols=output_cols,
            model_prompt=model_prompt,
            **batch_kwargs,
        )
        self.selector_column_name = selector_column_name
        self.prompt_template = {}
        if "All" in config_paths:
            self.prompt_template = Template(
                self.prompt_struct.format(**self.block_config)
            )
        else:
            for config_key, config in config_paths.items():
                filtered_config = {
                    k: (v if v is not None else "")
                    for k, v in self._load_config(config).items()
                }
                self.prompt_template[config_key] = Template(
                    self.prompt_struct.format(**filtered_config)
                )

    def _format_prompt(self, sample: Dict[str, Any]) -> str:
        """Format the prompt based on the selector column value.

        Parameters
        ----------
        sample : Dict[str, Any]
            Input sample containing the selector column.

        Returns
        -------
        str
            Formatted prompt string.
        """
        if isinstance(self.prompt_template, dict):
            return (
                self.prompt_template[sample[self.selector_column_name]]
                .render(**sample)
                .strip()
            )

        return self.prompt_template.render(**sample).strip()

    def _validate(
        self,
        prompt_template: Union[Template, Dict[str, Template]],
        input_dict: Dict[str, Any],
    ) -> bool:
        """Validate the input data for this block.

        Parameters
        ----------
        prompt_template : Union[Template, Dict[str, Template]]
            The template, or mapping of selector values to templates, to validate against.
        input_dict : Dict[str, Any]
            Input data to validate.

        Returns
        -------
        bool
            True if the input data is valid, False otherwise.
        """
        if isinstance(prompt_template, dict):
            prompt_template = prompt_template[input_dict[self.selector_column_name]]
        return super()._validate(prompt_template, input_dict)
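To tie the pieces together, here is a minimal sketch of driving `LLMBlock` end to end. The config keys mirror `prompt_struct` (`system`, `introduction`, `principles`, `examples`, `generation`), and `start_tags`/`end_tags` feed the default parser; the file path, server URL, and document text are illustrative assumptions, and prompt rendering additionally goes through `PromptRegistry`, which lives in `sdg_hub/registry.py` (not shown in this diff):

```python
# A sketch under assumed paths and endpoints, not the package's documented API.
import openai
from datasets import Dataset

from sdg_hub.blocks import LLMBlock

# Assumed config file; the five prompt_struct keys are required by str.format,
# and empty start/end tags make _parse keep the whole model output.
with open("gen_question.yaml", "w", encoding="utf-8") as f:
    f.write(
        'system: You are a helpful assistant.\n'
        'introduction: Write one question about the document.\n'
        'principles: Keep it short.\n'
        'examples: ""\n'
        'generation: "Document: {{ document }}"\n'
        'start_tags: [""]\n'
        'end_tags: [""]\n'
    )

# Any OpenAI-compatible endpoint works; URL and key are placeholders.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

block = LLMBlock(
    block_name="gen_question",
    config_path="gen_question.yaml",
    client=client,
    output_cols=["question"],  # column filled in by _parse
)

samples = Dataset.from_list([{"document": "Paris is the capital of France."}])
result = block.generate(samples, max_tokens=128)  # overrides the 4096 default
print(result[0]["question"])
```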