sdg-hub: 0.1.0a3-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/__init__.py +35 -5
- sdg_hub/blocks/block.py +58 -16
- sdg_hub/blocks/llmblock.py +149 -204
- sdg_hub/blocks/utilblocks.py +500 -43
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
- sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
- sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
- sdg_hub/configs/skills/contexts.yaml +18 -11
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
- sdg_hub/configs/skills/freeform_questions.yaml +21 -16
- sdg_hub/configs/skills/freeform_responses.yaml +19 -25
- sdg_hub/configs/skills/router.yaml +53 -6
- sdg_hub/flow.py +351 -21
- sdg_hub/flow_runner.py +216 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
- sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
- sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
- sdg_hub/pipeline.py +67 -12
- sdg_hub/prompts.py +26 -0
- sdg_hub/sdg.py +128 -86
- sdg_hub/utils/config_validation.py +91 -0
- sdg_hub/utils/validation_result.py +10 -0
- sdg_hub-0.1.1.dist-info/METADATA +190 -0
- sdg_hub-0.1.1.dist-info/RECORD +86 -0
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
- sdg_hub/blocks/filterblock.py +0 -76
- sdg_hub/blocks/iterblock.py +0 -31
- sdg_hub/blocks/rmblocks.py +0 -194
- sdg_hub/configs/annotations/simple.yaml +0 -10
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
- sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
- sdg_hub/utils/chunking.py +0 -73
- sdg_hub/utils/docprocessor.py +0 -357
- sdg_hub/utils/parse_and_convert.py +0 -392
- sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
- sdg_hub-0.1.0a3.dist-info/RECORD +0 -90
- /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
- /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
- /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
- /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
- /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
- /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
- /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
- /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py
CHANGED
sdg_hub/blocks/__init__.py
CHANGED
@@ -1,6 +1,36 @@
+"""Block implementations for SDG Hub.
+
+This package provides various block implementations for data generation, processing, and transformation.
+"""
+
 # Local
-from .block import
-from .
-from .
-
-
+from .block import Block
+from .llmblock import LLMBlock, ConditionalLLMBlock
+from .utilblocks import (
+    SamplePopulatorBlock,
+    SelectorBlock,
+    CombineColumnsBlock,
+    FlattenColumnsBlock,
+    DuplicateColumns,
+    RenameColumns,
+    SetToMajorityValue,
+    FilterByValueBlock,
+    IterBlock,
+)
+from ..registry import BlockRegistry
+
+__all__ = [
+    "Block",
+    "FilterByValueBlock",
+    "IterBlock",
+    "LLMBlock",
+    "ConditionalLLMBlock",
+    "SamplePopulatorBlock",
+    "SelectorBlock",
+    "CombineColumnsBlock",
+    "FlattenColumnsBlock",
+    "DuplicateColumns",
+    "RenameColumns",
+    "SetToMajorityValue",
+    "BlockRegistry",
+]
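Note: the restructured package flattens the import surface. Blocks that previously lived in their own modules (`filterblock.py`, `iterblock.py`, both deleted in this release) are now re-exported from `sdg_hub.blocks` via `utilblocks`. A minimal sketch of what the new `__all__` enables; illustrative only, constructor arguments omitted:

```python
# These imports mirror the __all__ list introduced above.
# FilterByValueBlock and IterBlock now come from utilblocks; the
# standalone filterblock.py and iterblock.py modules were removed.
from sdg_hub.blocks import (
    Block,
    LLMBlock,
    ConditionalLLMBlock,
    FilterByValueBlock,
    IterBlock,
    BlockRegistry,
)
```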
sdg_hub/blocks/block.py
CHANGED
@@ -1,8 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+"""Base block implementation for the SDG Hub system.
+
+This module provides the abstract base class for all blocks in the system,
+including functionality for template validation and configuration management.
+"""
+
 # Standard
 from abc import ABC
 from collections import ChainMap
-from typing import Any, Dict,
+from typing import Any, Dict, Optional
 
 # Third Party
 from jinja2 import Template, UndefinedError
@@ -17,24 +23,38 @@ logger = setup_logger(__name__)
 
 @BlockRegistry.register("Block")
 class Block(ABC):
+    """Base abstract class for all blocks in the system.
+
+    This class provides common functionality for block validation and configuration loading.
+    All specific block implementations should inherit from this class.
+    """
+
     def __init__(self, block_name: str) -> None:
         self.block_name = block_name
 
     @staticmethod
     def _validate(prompt_template: Template, input_dict: Dict[str, Any]) -> bool:
-        """
-
-        variables in the Jinja template are provided in the input_dict.
+        """Validate the input data for this block.
+
+        This method validates whether all required variables in the Jinja template are provided in the input_dict.
+
+        Parameters
+        ----------
+        prompt_template : Template
+            The Jinja2 template object.
+        input_dict : Dict[str, Any]
+            A dictionary of input values to check against the template.
 
-
-
-
+        Returns
+        -------
+        bool
+            True if the input data is valid (i.e., no missing variables), False otherwise.
         """
-
+
         class Default(dict):
             def __missing__(self, key: str) -> None:
                 raise KeyError(key)
-
+
         try:
             # Try rendering the template with the input_dict
             prompt_template.render(ChainMap(input_dict, Default()))
@@ -43,12 +63,34 @@ class Block(ABC):
             logger.error(f"Missing key: {e}")
             return False
 
-    def _load_config(self, config_path: str) ->
-        """
-        Load the configuration file for this block.
+    def _load_config(self, config_path: str) -> Optional[Dict[str, Any]]:
+        """Load the configuration file for this block.
 
-
-
+        Parameters
+        ----------
+        config_path : str
+            The path to the configuration file.
+
+        Returns
+        -------
+        Optional[Dict[str, Any]]
+            The loaded configuration. Returns None if file cannot be read or parsed.
+
+        Raises
+        ------
+        FileNotFoundError
+            If the configuration file does not exist.
         """
-
-
+        try:
+            with open(config_path, "r", encoding="utf-8") as config_file:
+                try:
+                    return yaml.safe_load(config_file)
+                except yaml.YAMLError as e:
+                    logger.error(f"Error parsing YAML from {config_path}: {e}")
+                    return None
+        except FileNotFoundError:
+            logger.error(f"Configuration file not found: {config_path}")
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error reading config file {config_path}: {e}")
+            return None
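Note: the rewritten `_load_config` gives callers three distinct outcomes: a parsed dict on success, `None` when the file exists but cannot be parsed (or an unexpected read error occurs), and a raised `FileNotFoundError` when the path is missing. A sketch of the caller-side contract this implies; the subclass instance and config path are illustrative (`Block` itself is abstract):

```python
# Hypothetical caller; any concrete Block subclass loads configs this way.
try:
    config = block._load_config("configs/knowledge/atomic_facts.yaml")
except FileNotFoundError:
    raise  # missing files fail fast
if config is None:
    # The file existed but held invalid YAML (or another read error):
    # the block has no usable configuration and should not proceed.
    ...
```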
sdg_hub/blocks/llmblock.py
CHANGED
@@ -1,7 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+"""LLM-based blocks for text generation and processing.
+
+This module provides blocks for interacting with language models.
+"""
+
 # Standard
-from
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Union
 import json
 import re
 
@@ -18,7 +22,18 @@ from ..registry import BlockRegistry, PromptRegistry
 logger = setup_logger(__name__)
 
 
-def server_supports_batched(client, model_id: str) -> bool:
+def server_supports_batched(client: openai.OpenAI, model_id: str) -> bool:
+    """Check if the server supports batched inputs.
+
+    This function checks if the server supports batched inputs by making a test call to the server.
+
+    Parameters
+    ----------
+    client : openai.OpenAI
+        The client to use to make the test call.
+    model_id : str
+        The model ID to use for the test call.
+    """
     supported = getattr(client, "server_supports_batched", None)
     if supported is not None:
         return supported
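Note: only the cached-attribute fast path of `server_supports_batched` is visible in this hunk; per the docstring, the rest of the function makes a test call. A plausible standalone sketch of such a probe, assuming the OpenAI completions API; this is illustrative, not sdg_hub's exact code:

```python
import openai

def probe_batched_support(client: openai.OpenAI, model_id: str) -> bool:
    """Illustrative probe: send two prompts in one completions request.

    Servers that accept a list-typed prompt and return one choice per
    prompt support batching; servers that reject it do not.
    """
    try:
        response = client.completions.create(
            model=model_id,
            prompt=["test", "test"],
            max_tokens=1,
        )
        return len(response.choices) >= 2
    except openai.APIError:
        return False
```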
@@ -38,19 +53,43 @@ def server_supports_batched(client, model_id: str) -> bool:
 
 
 @BlockRegistry.register("LLMBlock")
-# pylint: disable=dangerous-default-value
 class LLMBlock(Block):
+    """Block for generating text using language models.
+
+    This block handles text generation, prompt formatting, and output parsing
+    for language model interactions.
+
+    Parameters
+    ----------
+    block_name : str
+        Name of the block.
+    config_path : str
+        Path to the configuration file.
+    client : openai.OpenAI
+        OpenAI client instance.
+    output_cols : List[str]
+        List of output column names.
+    parser_kwargs : Dict[str, Any], optional
+        Keyword arguments for the parser, by default {}.
+    model_prompt : str, optional
+        Template string for model prompt, by default "{prompt}".
+    model_id : Optional[str], optional
+        Model ID to use, by default None.
+    **batch_kwargs : Dict[str, Any]
+        Additional keyword arguments for batch processing.
+    """
+
     # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
-        block_name,
-        config_path,
-        client,
-        output_cols,
-        parser_kwargs={},
-        model_prompt="{prompt}",
-        model_id=None,
-        **batch_kwargs,
+        block_name: str,
+        config_path: str,
+        client: openai.OpenAI,
+        output_cols: List[str],
+        parser_kwargs: Dict[str, Any] = {},
+        model_prompt: str = "{prompt}",
+        model_id: Optional[str] = None,
+        **batch_kwargs: Dict[str, Any],
     ) -> None:
         super().__init__(block_name)
         self.block_config = self._load_config(config_path)
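Note: with the typed signature above, constructing an LLMBlock reads as follows. This is a hedged sketch: the server URL, block name, and output column are made up, and the config path is one of the files added in this release. Observe that the mutable default `parser_kwargs: Dict[str, Any] = {}` survives the typing pass even though the `dangerous-default-value` pragma was dropped.

```python
import openai

# Assumed local OpenAI-compatible endpoint; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

block = LLMBlock(
    block_name="gen_questions",                               # illustrative
    config_path="configs/knowledge/generate_questions.yaml",  # added in 0.1.1
    client=client,
    output_cols=["question"],                                 # illustrative
    model_id=None,  # per the docstring, defaults to None
)
```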
@@ -84,7 +123,27 @@ class LLMBlock(Block):
         # and supports the n parameter to generate n outputs per input
         self.server_supports_batched = server_supports_batched(client, self.model)
 
-    def
+    def _extract_matches(
+        self, text: str, start_tag: Optional[str], end_tag: Optional[str]
+    ) -> List[str]:
+        if not text:
+            return []
+        if not start_tag and not end_tag:
+            return [text.strip()]
+
+        pattern = ""
+        if start_tag:
+            pattern += re.escape(start_tag)
+        pattern += r"(.*?)"
+        if end_tag:
+            pattern += re.escape(end_tag)
+        elif start_tag:
+            # Enforce matching till end of string when only start_tag is provided.
+            pattern += "$"
+
+        return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
+
+    def _parse(self, generated_string: str) -> dict:
         matches = {}
 
         if self.parser_name is not None and self.parser_name == "custom":
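Note: the new `_extract_matches` helper centralizes the tag-based parsing that `_parse` previously inlined (see the next hunk). Its behavior, reproduced standalone for illustration:

```python
import re
from typing import List, Optional

def extract_matches(text: str, start_tag: Optional[str], end_tag: Optional[str]) -> List[str]:
    # Standalone copy of the logic added above, for illustration only.
    if not text:
        return []
    if not start_tag and not end_tag:
        return [text.strip()]
    pattern = ""
    if start_tag:
        pattern += re.escape(start_tag)
    pattern += r"(.*?)"
    if end_tag:
        pattern += re.escape(end_tag)
    elif start_tag:
        pattern += "$"  # with only a start tag, match through to the end
    return [m.strip() for m in re.findall(pattern, text, re.DOTALL)]

extract_matches("[A] one [B] junk [A] two [B]", "[A]", "[B]")  # ["one", "two"]
extract_matches("preamble [A] tail", "[A]", None)              # ["tail"]
extract_matches("no tags at all", None, None)                  # ["no tags at all"]
```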
@@ -108,16 +167,9 @@ class LLMBlock(Block):
             self.block_config.get("end_tags", []),
             self.output_cols,
         ):
-
-
-
-            ]
-            else:
-                pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
-                all_matches = re.findall(pattern, generated_string, re.DOTALL)
-                matches[output_col] = (
-                    [match.strip() for match in all_matches] if all_matches else []
-                )
+            matches[output_col] = self._extract_matches(
+                generated_string, start_tag, end_tag
+            )
 
         return matches
 
@@ -127,7 +179,7 @@ class LLMBlock(Block):
             self.model_prompt, prompt_templated_str, add_generation_prompt=True
         ).strip()
 
-    def _generate(self, samples, **gen_kwargs) -> list:
+    def _generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> list:
         prompts = [self._format_prompt(sample) for sample in samples]
         logger.debug("Prompt: %s", prompts[0])
         generate_args = {**self.defaults, **gen_kwargs}
@@ -159,12 +211,16 @@ class LLMBlock(Block):
             results.append(response.choices[0].text.strip())
         return results
 
-    def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
-        """
-
+    def generate(self, samples: Dataset, **gen_kwargs: Dict[str, Any]) -> Dataset:
+        """Generate the output from the block.
+
+        This method should first validate the input data,
         then generate the output, and finally parse the generated output before returning it.
 
-
+        Returns
+        -------
+        Dataset
+            The parsed output after generation.
         """
         num_samples = self.block_config.get("num_samples", None)
         logger.debug("Generating outputs for {} samples".format(len(samples)))
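Note: the docstring spells out a validate → generate → parse contract. LLMBlock.generate's full body is not shown in this diff, but the removed LLMLogProbBlock.generate (later in this file's diff) follows the same shape; paraphrased:

```python
# Paraphrase of the validate -> generate -> parse contract; not the exact body.
valid = [s for s in samples if self._validate(self.prompt_template, s)]
if not valid:
    return Dataset.from_list([])
raw = self._generate(valid, **gen_kwargs)      # one generation per sample
parsed = [self._parse(text) for text in raw]   # dict of output_col -> matches
```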
@@ -219,16 +275,40 @@ class LLMBlock(Block):
 
 @BlockRegistry.register("ConditionalLLMBlock")
 class ConditionalLLMBlock(LLMBlock):
+    """Block for conditional text generation using language models.
+
+    This block selects different prompt templates based on a selector column value.
+
+    Parameters
+    ----------
+    block_name : str
+        Name of the block.
+    config_paths : Dict[str, str]
+        Dictionary mapping selector values to their config file paths.
+    client : openai.OpenAI
+        OpenAI client instance.
+    model_id : str
+        Model ID to use.
+    output_cols : List[str]
+        List of output column names.
+    selector_column_name : str
+        Name of the column used to select the prompt template.
+    model_prompt : str, optional
+        Template string for model prompt, by default "{prompt}".
+    **batch_kwargs : Dict[str, Any]
+        Additional keyword arguments for batch processing.
+    """
+
     def __init__(
         self,
-        block_name,
-        config_paths,
-        client,
-        model_id,
-        output_cols,
-        selector_column_name,
-        model_prompt="{prompt}",
-        **batch_kwargs,
+        block_name: str,
+        config_paths: Dict[str, str],
+        client: openai.OpenAI,
+        model_id: str,
+        output_cols: List[str],
+        selector_column_name: str,
+        model_prompt: str = "{prompt}",
+        **batch_kwargs: Dict[str, Any],
     ) -> None:
         super().__init__(
             block_name=block_name,
@@ -245,15 +325,27 @@ class ConditionalLLMBlock(LLMBlock):
             self.prompt_template = self.prompt_struct.format(**self.block_config)
         else:
             for config_key, config in config_paths.items():
-                # Template(self.prompt_struct.format(**filtered_config))
                 filtered_config = {
-                    k: (v if v is not None else "")
+                    k: (v if v is not None else "")
+                    for k, v in self.block_config.items()
                 }
-                self.prompt_template[config_key] = Template(
-                    **self._load_config(config)
-                )
+                self.prompt_template[config_key] = Template(
+                    self.prompt_struct.format(**self._load_config(config))
+                )
 
-    def _format_prompt(self, sample: Dict) -> str:
+    def _format_prompt(self, sample: Dict[str, Any]) -> str:
+        """Format the prompt based on the selector column value.
+
+        Parameters
+        ----------
+        sample : Dict[str, Any]
+            Input sample containing the selector column.
+
+        Returns
+        -------
+        str
+            Formatted prompt string.
+        """
         if isinstance(self.prompt_template, dict):
             return (
                 self.prompt_template[sample[self.selector_column_name]]
@@ -263,168 +355,21 @@ class ConditionalLLMBlock(LLMBlock):
 
         return self.prompt_template.render(**sample).strip()
 
-    def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool:
-
-        prompt_template = prompt_template[input_dict[self.selector_column_name]]
-        return super()._validate(prompt_template, input_dict)
-
-
-@BlockRegistry.register("LLMLogProbBlock")
-class LLMLogProbBlock(LLMBlock):
-    # init with init of the parent class
-    def __init__(
-        self,
-        block_name,
-        config_path,
-        client,
-        output_cols,
-        parser_kwargs={},
-        model_prompt="{prompt}",
-        model_id=None,
-        **batch_kwargs,
-    ) -> None:
-        super().__init__(
-            block_name=block_name,
-            config_path=config_path,
-            client=client,
-            output_cols=output_cols,
-            parser_kwargs=parser_kwargs,
-            model_prompt=model_prompt,
-            model_id=model_id,
-            **batch_kwargs,
-        )
-
-    def _generate_logprobs(self, samples, **gen_kwargs):
-        prompts = [
-            self.model_prompt.format(prompt=self._format_prompt(sample))
-            for sample in samples
-        ]
-        generate_args = {**self.defaults, **gen_kwargs}
-
-        # verify if logprobs is mentioned in the generate_args, if not add it and return top10 logprobs
-        if "logprobs" not in generate_args:
-            generate_args["logprobs"] = 10
+    def _validate(self, prompt_template: Union[str, Template], input_dict: Dict[str, Any]) -> bool:
+        """Validate the input data for this block.
 
-
-
-
+        Parameters
+        ----------
+        prompt_template : Union[str, Template]
+            The template to validate against.
+        input_dict : Dict[str, Any]
+            Input data to validate.
 
-
-
-
-
-            response = self.client.completions.create(
-                prompt=prompt, **generate_args
-            )
-            results.append(response.choices[0].logprobs.top_logprobs)
-        return results
-
-    def _parse(self, generations: List[List[Dict]]) -> List[List[str]]:
-        # override the parse method to convert the generations to json string
-        # convert the generations to json string to save as dataset
-        # this is because the dataset can only store key value pairs which are consistent
-        return [[json.dumps(item) for item in sublist] for sublist in generations]
-
-    def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
+        Returns
+        -------
+        bool
+            True if the input data is valid, False otherwise.
         """
-
-
-
-        :return: The parsed output after generation.
-        """
-        num_samples = self.block_config.get("num_samples", None)
-        logger.debug("Generating outputs for {} samples".format(len(samples)))
-
-        if (num_samples is not None) and ("num_samples" not in samples.column_names):
-            samples = samples.add_column("num_samples", [num_samples] * len(samples))
-
-        # validate each sample
-        # Log errors and remove invalid samples
-        valid_samples = []
-
-        for sample in samples:
-            if self._validate(self.prompt_template, sample):
-                valid_samples.append(sample)
-            else:
-                logger.warning(
-                    f"Sample failed validation: {sample}"
-                )  # Log details of the failed sample
-
-        samples = valid_samples
-
-        if len(samples) == 0:
-            logger.warning(
-                "No valid samples to generate outputs for, returning empty dataset"
-            )
-            return Dataset.from_list([])
-
-        # generate the output
-
-        outputs = self._generate_logprobs(samples, **gen_kwargs)
-        logger.debug("Generated outputs: %s", outputs)
-
-        output_dataset = Dataset.from_list(samples)
-        output_dataset = output_dataset.add_column(
-            self.output_cols[0],
-            self._parse(outputs),  # pylint: disable=no-value-for-parameter
-        )
-
-        return output_dataset
-
-
-@BlockRegistry.register("LLMMessagesBlock")
-class LLMMessagesBlock(Block):
-    def __init__(
-        self,
-        block_name,
-        client,
-        input_col,
-        output_col,
-        model_prompt=None,
-        model_id=None,
-        **batch_kwargs,
-    ) -> None:
-        self.block_name = block_name
-        self.model_prompt = model_prompt
-        self.batch_params = batch_kwargs.get("batch_kwargs", {})
-        self.input_col = input_col
-        self.output_col = output_col
-        self.client = client
-
-        if model_id:
-            self.model = model_id
-        else:
-            self.model = self.client.models.list().data[0].id
-
-        self.defaults = {
-            "model": self.model,
-            "temperature": 0,
-            "max_tokens": 4096,
-        }
-        self.server_supports_batched = server_supports_batched(client, self.model)
-
-    def _generate(self, samples, **gen_kwargs) -> list:
-        generate_args = {**self.defaults, **gen_kwargs}
-
-        if "n" in generate_args and generate_args.get("temperature", 0) <= 0:
-            generate_args["temperature"] = 0.7
-            logger.warning(
-                "Temperature should be greater than 0 for n > 1, setting temperature to 0.7"
-            )
-
-        messages = samples[self.input_col]
-
-        results = []
-        n = gen_kwargs.get("n", 1)
-        for message in messages:
-            responses = self.client.chat.completions.create(messages=message, **generate_args)
-            if n > 1:
-                results.append([choice.message.content for choice in responses.choices])
-            else:
-                results.append(responses.choices[0].message.content)
-        return results
-
-    def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
-        outputs = self._generate(samples, **gen_kwargs)
-        samples = samples.add_column(self.output_col, outputs)
-        return samples
+        if isinstance(prompt_template, dict):
+            prompt_template = prompt_template[input_dict[self.selector_column_name]]
+        return super()._validate(prompt_template, input_dict)