sdg-hub 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. sdg_hub/__init__.py +3 -0
  2. sdg_hub/_version.py +21 -0
  3. sdg_hub/blocks/__init__.py +36 -0
  4. sdg_hub/blocks/block.py +96 -0
  5. sdg_hub/blocks/llmblock.py +375 -0
  6. sdg_hub/blocks/utilblocks.py +597 -0
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/__init__.py +0 -0
  9. sdg_hub/configs/annotations/__init__.py +0 -0
  10. sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
  11. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  12. sdg_hub/configs/annotations/detailed_description.yaml +10 -0
  13. sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
  14. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  15. sdg_hub/configs/knowledge/__init__.py +0 -0
  16. sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
  17. sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
  18. sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
  19. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
  20. sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
  21. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
  22. sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
  23. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
  24. sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
  25. sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
  26. sdg_hub/configs/knowledge/router.yaml +12 -0
  27. sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
  28. sdg_hub/configs/reasoning/__init__.py +0 -0
  29. sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
  30. sdg_hub/configs/skills/__init__.py +0 -0
  31. sdg_hub/configs/skills/analyzer.yaml +48 -0
  32. sdg_hub/configs/skills/annotation.yaml +36 -0
  33. sdg_hub/configs/skills/contexts.yaml +28 -0
  34. sdg_hub/configs/skills/critic.yaml +60 -0
  35. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
  36. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
  37. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
  38. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
  39. sdg_hub/configs/skills/freeform_questions.yaml +34 -0
  40. sdg_hub/configs/skills/freeform_responses.yaml +39 -0
  41. sdg_hub/configs/skills/grounded_questions.yaml +38 -0
  42. sdg_hub/configs/skills/grounded_responses.yaml +59 -0
  43. sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
  44. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  45. sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
  46. sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
  47. sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
  48. sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
  49. sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
  50. sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
  51. sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
  52. sdg_hub/configs/skills/judge.yaml +53 -0
  53. sdg_hub/configs/skills/planner.yaml +67 -0
  54. sdg_hub/configs/skills/respond.yaml +8 -0
  55. sdg_hub/configs/skills/revised_responder.yaml +78 -0
  56. sdg_hub/configs/skills/router.yaml +59 -0
  57. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
  58. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
  59. sdg_hub/flow.py +306 -0
  60. sdg_hub/flow_runner.py +204 -0
  61. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
  62. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
  63. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
  64. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
  65. sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
  66. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
  67. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
  68. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
  69. sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
  70. sdg_hub/logger_config.py +20 -0
  71. sdg_hub/pipeline.py +121 -0
  72. sdg_hub/prompts.py +43 -0
  73. sdg_hub/py.typed +0 -0
  74. sdg_hub/registry.py +122 -0
  75. sdg_hub/sdg.py +206 -0
  76. sdg_hub/utils/__init__.py +5 -0
  77. sdg_hub/utils/datautils.py +14 -0
  78. sdg_hub-0.1.0.dist-info/METADATA +190 -0
  79. sdg_hub-0.1.0.dist-info/RECORD +82 -0
  80. sdg_hub-0.1.0.dist-info/WHEEL +5 -0
  81. sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
  82. sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/registry.py ADDED
@@ -0,0 +1,122 @@
1
+ # Standard
2
+ from typing import Union, List, Dict
3
+
4
+ # Third Party
5
+ from jinja2 import Template
6
+
7
+ # Local
8
+ from .logger_config import setup_logger
9
+
10
+ logger = setup_logger(__name__)
11
+
12
+
13
class BlockRegistry:
    """Name-to-class lookup table for blocks.

    Block classes add themselves through the :meth:`register` decorator, so
    the block type map never has to be edited by hand.
    """

    # Shared by every caller: the mapping lives on the class, not on instances.
    _registry: Dict[str, type] = {}

    @classmethod
    def register(cls, block_name: str):
        """
        Build a class decorator that files the decorated class under ``block_name``.

        :param block_name: Key to store the block class under.
        :return: A decorator that records the class and returns it unchanged.
        """

        def _record(block_class):
            # Store first, then log, so the entry exists before anything else runs.
            cls._registry.update({block_name: block_class})
            class_name = block_class.__name__
            logger.debug(
                f"Registered block '{block_name}' with class '{class_name}'"
            )
            return block_class

        return _record

    @classmethod
    def get_registry(cls):
        """
        Return the live mapping of registered block names to block classes.

        :return: The registry dictionary itself (not a copy).
        """
        logger.debug("Fetching the block registry map.")
        registry = cls._registry
        return registry
44
+
45
+
46
class PromptRegistry:
    """Registry for managing Jinja2 prompt templates.

    Templates are registered by decorating a zero-argument function that
    returns the template source. The source is compiled into a
    ``jinja2.Template`` once, at decoration time.
    """

    # Shared by every caller: the mapping lives on the class, not on instances.
    _registry: Dict[str, Template] = {}

    @classmethod
    def register(cls, name: str):
        """Decorator to register a Jinja2 template function by name.

        :param name: Name of the template to register.
        :return: A decorator that registers the Jinja2 template function.
        """

        def decorator(func):
            # The decorated function is called immediately to obtain the
            # template source; only the compiled Template is stored.
            template_str = func()
            cls._registry[name] = Template(template_str)
            logger.debug(f"Registered prompt template '{name}'")
            return func

        return decorator

    @classmethod
    def get_template(cls, name: str) -> Template:
        """Retrieve a Jinja2 template by name.

        :param name: Name of the template to retrieve.
        :return: The Jinja2 template instance.
        :raises KeyError: If no template is registered under ``name``.
        """
        if name not in cls._registry:
            raise KeyError(f"Template '{name}' not found.")
        logger.debug(f"Retrieving prompt template '{name}'")
        return cls._registry[name]

    @classmethod
    def get_registry(cls) -> Dict[str, Template]:
        """
        Retrieve the current registry map of prompt templates.

        :return: Dictionary of registered template names and compiled
            Jinja2 templates (the live mapping, not a copy).
        """
        logger.debug("Fetching the prompt registry map.")
        return cls._registry

    @classmethod
    def render_template(
        cls,
        name: str,
        messages: Union[str, List[Dict[str, str]]],
        add_generation_prompt: bool = True,
    ) -> str:
        """Render the template with the provided messages or query.

        :param name: Name of the template to render.
        :param messages: Either a single query string or a list of messages (each as a dict with 'role' and 'content').
        :param add_generation_prompt: Whether to add a generation prompt at the end.
        :return: The rendered prompt as a string.
        :raises ValueError: If ``name`` is "blank" and ``messages`` is not a string.
        :raises KeyError: If ``name`` is not "blank" and is not registered.
        """

        # Special handling for "blank" template: it is resolved here and
        # never looked up in the registry.
        if name == "blank":
            if not isinstance(messages, str):
                raise ValueError(
                    "The 'blank' template can only be used with a single query string, not a list of messages."
                )
            return messages  # Return the query as-is without templating

        # Get the template
        template = cls.get_template(name)

        # If `messages` is a string, wrap it in a list with a default user role
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]

        # Render the template with the `messages` list
        return template.render(
            messages=messages, add_generation_prompt=add_generation_prompt
        )
sdg_hub/sdg.py ADDED
@@ -0,0 +1,206 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ """Synthetic Data Generator (SDG) module for managing data generation flows."""
4
+
5
+ # Standard
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from typing import List, Optional, Tuple
8
+ import traceback
9
+
10
+ # Third Party
11
+ from datasets import Dataset
12
+ from tqdm import tqdm
13
+
14
+ # Local
15
+ from .checkpointer import Checkpointer
16
+ from .flow import Flow
17
+ from .logger_config import setup_logger
18
+ from .utils.datautils import safe_concatenate_datasets
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
class SDG:
    """Synthetic Data Generator class.

    This class manages the generation of synthetic data using one or more
    data generation flows.

    Parameters
    ----------
    flows : List[Flow]
        List of flows to execute.
    num_workers : int, optional
        Number of worker threads to use, by default 1
    batch_size : Optional[int], optional
        Size of batches to process, by default None
    save_freq : Optional[int], optional
        Frequency of checkpoint saves, by default None

    Attributes
    ----------
    flows : List[Flow]
        List of flows to execute.
    num_workers : int
        Number of worker threads to use.
    batch_size : Optional[int]
        Size of batches to process.
    save_freq : Optional[int]
        Frequency of checkpoint saves.
    """

    def __init__(
        self,
        flows: List[Flow],
        num_workers: int = 1,
        batch_size: Optional[int] = None,
        save_freq: Optional[int] = None,
    ) -> None:
        self.flows = flows
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.save_freq = save_freq

    def _split_dataset(
        self, dataset: Dataset, batch_size: int
    ) -> List[Tuple[int, int]]:
        """Split the dataset into smaller batches.

        Parameters
        ----------
        dataset : Dataset
            The dataset to split.
        batch_size : int
            Size of each batch.

        Returns
        -------
        List[Tuple[int, int]]
            List of (start, end) indices for each batch. ``end`` is
            exclusive and is clamped to the dataset length for the last batch.
        """
        total_size = len(dataset)
        # Ceiling division so a trailing partial batch is still included.
        num_batches = (total_size + batch_size - 1) // batch_size

        batches = [
            (i * batch_size, min((i + 1) * batch_size, total_size))
            for i in tqdm(range(num_batches))
        ]

        return batches

    @staticmethod
    def _generate_data(
        flows: List[Flow],
        input_split: Tuple[int, int],
        ds: Dataset,
        i: Optional[int] = None,
    ) -> Optional[Dataset]:
        """Generate data for a single split using the provided flows.

        Parameters
        ----------
        flows : List[Flow]
            List of flows to execute.
        input_split : Tuple[int, int]
            (start, end) indices for the current split.
        ds : Dataset
            The full input dataset.
        i : Optional[int], optional
            Split index for logging, by default None

        Returns
        -------
        Optional[Dataset]
            Generated dataset for the split, or None if generation failed.
        """
        logger.info(f"Processing split {i}")
        split_data = ds.select(range(input_split[0], input_split[1]))
        try:
            # Each flow consumes the output of the previous one.
            for flow in flows:
                split_data = flow.generate(split_data)
            return split_data
        except Exception as e:
            # Best effort: a failing split is logged and reported as None so
            # the remaining splits can still be processed.
            logger.error(f"Error processing split {i}: {e}")
            traceback.print_exc()
            return None

    def generate(
        self, dataset: Dataset, checkpoint_dir: Optional[str] = None
    ) -> Dataset:
        """Generate synthetic data using the configured flows.

        Parameters
        ----------
        dataset : Dataset
            The input dataset to process.
        checkpoint_dir : Optional[str], optional
            Directory to save checkpoints, by default None

        Returns
        -------
        Dataset
            The generated dataset. Data restored from existing checkpoints
            is included in the result. In batched mode the result is
            whatever ``safe_concatenate_datasets`` returns for the collected
            splits.

        Notes
        -----
        If checkpoint_dir is provided, the generation process can be resumed
        from the last checkpoint in case of interruption.
        """
        # Initialize checkpointer
        checkpointer = Checkpointer(checkpoint_dir, self.save_freq)

        # Load existing checkpoints and determine missing data
        seed_data, pre_generated_data = checkpointer.load_existing_data(dataset)

        # If all data has been generated, return the pre-generated data
        if seed_data.num_rows == 0 and pre_generated_data is not None:
            return pre_generated_data

        if not self.batch_size:
            # If batch size is not provided, generate the dataset in a single pass
            generated_dataset = seed_data
            # generated_dataset is initialized with seed_data, and it gets
            # updated with each flow
            for flow in self.flows:
                generated_dataset = flow.generate(generated_dataset)

            if pre_generated_data is None:
                return generated_dataset

            # Keep rows restored from checkpoints in the result, consistent
            # with the batched path below.
            merged_dataset = safe_concatenate_datasets(
                [pre_generated_data, generated_dataset]
            )
            return merged_dataset if merged_dataset is not None else generated_dataset

        logger.info("Splitting the dataset into smaller batches")
        input_splits = self._split_dataset(seed_data, self.batch_size)
        logger.info(
            f"Generating dataset with {len(input_splits)} splits, "
            f"batch size {self.batch_size}, and {self.num_workers} workers"
        )

        generated_data = [pre_generated_data] if pre_generated_data is not None else []
        # Position in `generated_data` up to which results are already
        # checkpointed. It starts past the pre-generated entry, which came
        # from `load_existing_data` and must not be written out a second time.
        # NOTE(review): assumes pre-generated data originates from previously
        # saved checkpoints -- confirm against Checkpointer.
        last_saved_split_index = len(generated_data)

        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            futures = [
                executor.submit(
                    self._generate_data, self.flows, input_split, seed_data, i
                )
                for i, input_split in enumerate(input_splits)
            ]

            # `i` counts completed futures (completion order), not split ids.
            for i, future in enumerate(tqdm(as_completed(futures), total=len(futures))):
                generated_data_split = future.result()  # Ensure each future completes

                if generated_data_split:
                    generated_data.append(generated_data_split)
                    logger.info(f"Finished future processing split {i} \n\n")

                # Use checkpointer to handle intermediate saves
                if checkpointer.should_save_checkpoint(i):
                    # Save only the new splits since the last checkpoint.
                    # Slice by list position rather than by `i`: failed
                    # splits are never appended and the list may start with
                    # pre-generated data, so `i` does not map onto positions
                    # in `generated_data`.
                    new_splits = generated_data[last_saved_split_index:]
                    checkpoint_dataset = safe_concatenate_datasets(new_splits)
                    # check if checkpoint_dataset is not None
                    if checkpoint_dataset is not None:
                        checkpointer.save_intermediate_checkpoint(
                            checkpoint_dataset
                        )
                        last_saved_split_index = len(generated_data)

        generated_dataset = safe_concatenate_datasets(generated_data)

        return generated_dataset
@@ -0,0 +1,5 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # This is part of the public API, and used by instructlab
4
class GenerateException(Exception):
    """Error type raised when the generate step fails."""
@@ -0,0 +1,14 @@
1
+ # Third Party
2
+ from datasets import concatenate_datasets
3
+
4
+
5
def safe_concatenate_datasets(datasets: list):
    """
    Merge the usable entries of ``datasets`` into a single dataset.

    Entries that are ``None`` or that report no rows are skipped. Returns
    ``None`` when nothing usable remains; otherwise returns the result of
    ``concatenate_datasets`` applied to the remaining entries, in order.
    """
    usable = []
    for candidate in datasets:
        if candidate is None:
            continue
        if not candidate.num_rows > 0:
            continue
        usable.append(candidate)

    if usable:
        return concatenate_datasets(usable)

    return None
@@ -0,0 +1,190 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdg_hub
3
+ Version: 0.1.0
4
+ Summary: Synthetic Data Generation
5
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
+ License: Apache-2.0
7
+ Project-URL: homepage, https://ai-innovation.team/
8
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
9
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: Implementation :: CPython
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: click<9.0.0,>=8.1.7
27
+ Requires-Dist: datasets<4.0.0,>=2.18.0
28
+ Requires-Dist: httpx<1.0.0,>=0.25.0
29
+ Requires-Dist: jinja2
30
+ Requires-Dist: openai<2.0.0,>=1.13.3
31
+ Requires-Dist: rich
32
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
33
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
34
+ Provides-Extra: web-interface
35
+ Requires-Dist: flask>=3.0.2; extra == "web-interface"
36
+ Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
37
+ Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
38
+ Provides-Extra: vllm
39
+ Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
40
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
41
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
42
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
43
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
44
+ Provides-Extra: examples
45
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
46
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
47
+ Requires-Dist: langchain-text-splitters; extra == "examples"
48
+ Requires-Dist: docling>=2.3.0; extra == "examples"
49
+ Provides-Extra: dev
50
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
51
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
52
+ Requires-Dist: pylint-pydantic; extra == "dev"
53
+ Requires-Dist: pytest; extra == "dev"
54
+ Requires-Dist: pytest-asyncio; extra == "dev"
55
+ Requires-Dist: pytest-cov; extra == "dev"
56
+ Requires-Dist: pytest-html; extra == "dev"
57
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
58
+ Dynamic: license-file
59
+
60
+ # SDG Hub: Synthetic Data Generation Toolkit
61
+
62
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
63
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
64
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
65
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
66
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
67
+
68
+ <html>
69
+ <h3 align="center">
70
+ A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
71
+ </h3>
72
+ <h3 align="center">
73
+ <a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
74
+ <a href="examples/">Examples</a> |
75
+ <a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
76
+ </h3>
77
+ </html>
78
+
79
+ SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
80
+
81
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
82
+
83
+ ---
84
+
85
+ ## ✨ Key Features
86
+
87
+ - **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
88
+ simple YAML configuration files without writing any code.
89
+
90
+ - **Modular Block System**: Compose workflows from reusable, self-contained
91
+ blocks that handle LLM calls, data transformations, and filtering.
92
+
93
+ - **LLM-Agnostic**: Works with any language model through configurable
94
+ prompt templates and generation parameters.
95
+
96
+ - **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
97
+
98
+ ## 🚀 Installation
99
+
100
+ ### Stable Release (Recommended)
101
+
102
+ ```bash
103
+ pip install sdg-hub
104
+ ```
105
+
106
+ ### Development Version
107
+
108
+ ```bash
109
+ pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
110
+ ```
111
+
112
+ ## 🏁 Quick Start
113
+
114
+ ### Prerequisites
115
+
116
+ Before getting started, make sure you have:
117
+ - Python 3.9 or higher
118
+ - LLM Inference Endpoint exposed through OpenAI API
119
+
120
+ ### Simple Example
121
+
122
+ Here's the simplest way to get started:
123
+
124
+ ```python
125
+ from sdg_hub.flow_runner import run_flow
126
+
127
+ # Run a basic knowledge generation flow
128
+ run_flow(
129
+ ds_path="my_data.jsonl",
130
+ save_path="output.jsonl",
131
+ endpoint="http://0.0.0.0:8000/v1",
132
+ flow_path="flows/generation/knowledge/synth_knowledge.yaml"
133
+ )
134
+ ```
135
+
136
+ ### Advanced Configuration
137
+ You can invoke any built-in flow using run_flow:
138
+ ```python
139
+ from sdg_hub.flow_runner import run_flow
140
+
141
+ run_flow(
142
+ ds_path="path/to/dataset.jsonl",
143
+ save_path="path/to/output.jsonl",
144
+ endpoint="http://0.0.0.0:8000/v1",
145
+ flow_path="path/to/flow.yaml",
146
+ checkpoint_dir="path/to/checkpoints",
147
+ batch_size=8,
148
+ num_workers=32,
149
+ save_freq=2,
150
+ )
151
+ ```
152
+
153
+ ### 📂 Available Built-in Flows
154
+
155
+ You can start with any of these YAML flows out of the box:
156
+
157
+ #### 🔎 **Knowledge Flows**
158
+
159
+ | Flow | Description |
160
+ |------|-------------|
161
+ | `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
162
+ | `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
163
+
164
+ #### 🧠 **Skills Flows**
165
+
166
+ | Flow | Description |
167
+ |------|-------------|
168
+ | `synth_skills.yaml` | Freeform skills QA generation (e.g., "Create a new github issue to add type hints") |
169
+ | `synth_grounded_skills.yaml` | Domain-specific skill generation (e.g., "From the given conversation create a table for feature requests") |
170
+ | `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
171
+
172
+ All these can be found here: [flows](src/sdg_hub/flows)
173
+
174
+ ## 📺 Video Tutorial
175
+
176
+ For a comprehensive walkthrough of sdg_hub:
177
+
178
+ [![SDG Hub Tutorial](https://img.youtube.com/vi/aGKCViWjAmA/0.jpg)](https://www.youtube.com/watch?v=aGKCViWjAmA)
179
+
180
+ ## 🤝 Contributing
181
+
182
+ We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
183
+
184
+ ## 📄 License
185
+
186
+ This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
187
+
188
+ ---
189
+
190
+ Built with ❤️ by the Red Hat AI Innovation Team
@@ -0,0 +1,82 @@
1
+ sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
2
+ sdg_hub/_version.py,sha256=-LyU5F1uZDjn6Q8_Z6-_FJt_8RE4Kq9zcKdg1abSSps,511
3
+ sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
4
+ sdg_hub/flow.py,sha256=YQNtI7KIqdr4zymUIiTe2pJf7xNVoHNs123W_nRimDo,10914
5
+ sdg_hub/flow_runner.py,sha256=V2VY_gbSWXTujywtFLWr_hKZspkD_41oV5CRzRhKbRo,4829
6
+ sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
7
+ sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
8
+ sdg_hub/prompts.py,sha256=rtiUS2IuaMAQVAy8aAwGxmk23sKC2Qqro7edymbENrk,8165
9
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
11
+ sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
12
+ sdg_hub/blocks/__init__.py,sha256=pmxlv29ohPRdIVE9ojnBs3I58UwNMU0uTtGozOZuZzc,807
13
+ sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
14
+ sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
15
+ sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
16
+ sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
19
+ sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
20
+ sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
21
+ sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
22
+ sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
23
+ sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=9icyigsMooyBR_nEwWgj9eBAnuc3kMZMNnEy6AxFSKU,2430
25
+ sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
26
+ sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=PBymlZljkzN8kbo5DgmNsSM_Xb76SZifuS5Yl-x4Uy4,365
27
+ sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
28
+ sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
29
+ sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
30
+ sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=06Z9lDiZUsQEURhpwWUVXA3wYO3bRaC0aNoGCpo3-44,376
31
+ sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
32
+ sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
33
+ sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
34
+ sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
35
+ sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
36
+ sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
38
+ sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
40
+ sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
41
+ sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
42
+ sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
43
+ sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
44
+ sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
45
+ sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
46
+ sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
47
+ sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
48
+ sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
49
+ sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
50
+ sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
51
+ sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
52
+ sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
53
+ sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
54
+ sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
55
+ sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
56
+ sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
57
+ sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
58
+ sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
59
+ sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
+ sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
61
+ sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
62
+ sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
63
+ sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
64
+ sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
65
+ sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
66
+ sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
67
+ sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
68
+ sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
69
+ sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
70
+ sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=aVnHkp0DkeuVgWdZ2eUQf5-uzI8tPYOkrs27yoF8m5g,3393
71
+ sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
72
+ sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
73
+ sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
74
+ sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
75
+ sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
76
+ sdg_hub/utils/__init__.py,sha256=UEo-9qPt5iVKBIRvgZhOI0SoIBO6zeBxOuLvUQXaM3g,185
77
+ sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
78
+ sdg_hub-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
79
+ sdg_hub-0.1.0.dist-info/METADATA,sha256=y2r66Mmm4OCTiouJZcLVyTFHR1o3eqD5pAlS-8ow8uE,7247
80
+ sdg_hub-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
81
+ sdg_hub-0.1.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
82
+ sdg_hub-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+