sdg-hub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +3 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +36 -0
- sdg_hub/blocks/block.py +96 -0
- sdg_hub/blocks/llmblock.py +375 -0
- sdg_hub/blocks/utilblocks.py +597 -0
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +28 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +34 -0
- sdg_hub/configs/skills/freeform_responses.yaml +39 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
- sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
- sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
- sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
- sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +59 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +306 -0
- sdg_hub/flow_runner.py +204 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +121 -0
- sdg_hub/prompts.py +43 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +206 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub-0.1.0.dist-info/METADATA +190 -0
- sdg_hub-0.1.0.dist-info/RECORD +82 -0
- sdg_hub-0.1.0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/registry.py
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
# Standard
|
2
|
+
from typing import Union, List, Dict
|
3
|
+
|
4
|
+
# Third Party
|
5
|
+
from jinja2 import Template
|
6
|
+
|
7
|
+
# Local
|
8
|
+
from .logger_config import setup_logger
|
9
|
+
|
10
|
+
logger = setup_logger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class BlockRegistry:
    """Registry for block classes to avoid manual additions to block type map.

    Block classes add themselves through the :meth:`register` decorator and
    are later looked up by name through :meth:`get_registry`.
    """

    # Class-level map shared by every user of the registry: name -> block class.
    _registry: Dict[str, type] = {}

    @classmethod
    def register(cls, block_name: str):
        """
        Decorator to register a block class under a specified name.

        Registering a second, different class under a name that is already
        taken still replaces the earlier entry, but a warning is logged so
        the collision does not go unnoticed.

        :param block_name: Name under which to register the block.
        :return: A decorator that stores the class and returns it unchanged.
        """

        def decorator(block_class):
            previous = cls._registry.get(block_name)
            # Re-registering the very same class (e.g. on module reload) is
            # harmless, so only warn when a different class is displaced.
            if previous is not None and previous is not block_class:
                logger.warning(
                    f"Block '{block_name}' is already registered with class "
                    f"'{previous.__name__}'; overriding with "
                    f"'{block_class.__name__}'"
                )
            cls._registry[block_name] = block_class
            logger.debug(
                f"Registered block '{block_name}' with class '{block_class.__name__}'"
            )
            return block_class

        return decorator

    @classmethod
    def get_registry(cls):
        """
        Retrieve the current registry map of block types.

        The returned dictionary is the registry itself, not a copy.

        :return: Dictionary of registered block names and classes.
        """
        logger.debug("Fetching the block registry map.")
        return cls._registry
|
44
|
+
|
45
|
+
|
46
|
+
class PromptRegistry:
    """Registry for managing Jinja2 prompt templates.

    Template-producing functions register themselves through
    :meth:`register`; the compiled templates are then retrieved with
    :meth:`get_template` or rendered directly with :meth:`render_template`.
    """

    # Class-level map shared by every user of the registry: name -> Template.
    _registry: Dict[str, Template] = {}

    @classmethod
    def register(cls, name: str):
        """Decorator to register a Jinja2 template function by name.

        The decorated function is called once, at decoration time, and the
        string it returns is compiled into a ``Template``.

        :param name: Name of the template to register.
        :return: A decorator that registers the Jinja2 template function.
        """

        def decorator(func):
            template_str = func()
            cls._registry[name] = Template(template_str)
            logger.debug(f"Registered prompt template '{name}'")
            return func

        return decorator

    @classmethod
    def get_template(cls, name: str) -> Template:
        """Retrieve a Jinja2 template by name.

        :param name: Name of the template to retrieve.
        :return: The Jinja2 template instance.
        :raises KeyError: If no template is registered under ``name``.
        """
        if name not in cls._registry:
            raise KeyError(f"Template '{name}' not found.")
        logger.debug(f"Retrieving prompt template '{name}'")
        return cls._registry[name]

    @classmethod
    def get_registry(cls):
        """
        Retrieve the current registry map of prompt templates.

        The returned dictionary is the registry itself, not a copy.

        :return: Dictionary of registered template names and Jinja2 templates.
        """
        logger.debug("Fetching the prompt registry map.")
        return cls._registry

    @classmethod
    def render_template(
        cls,
        name: str,
        messages: Union[str, List[Dict[str, str]]],
        add_generation_prompt: bool = True,
    ) -> str:
        """Render the template with the provided messages or query.

        :param name: Name of the template to render.
        :param messages: Either a single query string or a list of messages (each as a dict with 'role' and 'content').
        :param add_generation_prompt: Whether to add a generation prompt at the end.
        :return: The rendered prompt as a string.
        :raises ValueError: If ``name`` is "blank" and ``messages`` is not a string.
        :raises KeyError: If no template is registered under ``name``.
        """

        # Special handling for "blank" template: it is never looked up in the
        # registry, the query is passed through untouched.
        if name == "blank":
            if not isinstance(messages, str):
                raise ValueError(
                    "The 'blank' template can only be used with a single query string, not a list of messages."
                )
            return messages  # Return the query as-is without templating

        # Get the template
        template = cls.get_template(name)

        # If `messages` is a string, wrap it in a list with a default user role
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]

        # Render the template with the `messages` list
        return template.render(
            messages=messages, add_generation_prompt=add_generation_prompt
        )
|
sdg_hub/sdg.py
ADDED
@@ -0,0 +1,206 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
|
3
|
+
"""Synthetic Data Generator (SDG) module for managing data generation flows."""
|
4
|
+
|
5
|
+
# Standard
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
7
|
+
from typing import List, Optional, Tuple
|
8
|
+
import traceback
|
9
|
+
|
10
|
+
# Third Party
|
11
|
+
from datasets import Dataset
|
12
|
+
from tqdm import tqdm
|
13
|
+
|
14
|
+
# Local
|
15
|
+
from .checkpointer import Checkpointer
|
16
|
+
from .flow import Flow
|
17
|
+
from .logger_config import setup_logger
|
18
|
+
from .utils.datautils import safe_concatenate_datasets
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class SDG:
    """Synthetic Data Generator class.

    This class manages the generation of synthetic data using one or more
    data generation flows.

    Parameters
    ----------
    flows : List[Flow]
        List of flows to execute.
    num_workers : int, optional
        Number of worker threads to use, by default 1
    batch_size : Optional[int], optional
        Size of batches to process, by default None
    save_freq : Optional[int], optional
        Frequency of checkpoint saves, by default None

    Attributes
    ----------
    flows : List[Flow]
        List of flows to execute.
    num_workers : int
        Number of worker threads to use.
    batch_size : Optional[int]
        Size of batches to process.
    save_freq : Optional[int]
        Frequency of checkpoint saves.
    """

    def __init__(
        self,
        flows: List[Flow],
        num_workers: int = 1,
        batch_size: Optional[int] = None,
        save_freq: Optional[int] = None,
    ) -> None:
        self.flows = flows
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.save_freq = save_freq

    def _split_dataset(
        self, dataset: Dataset, batch_size: int
    ) -> List[Tuple[int, int]]:
        """Split the dataset into smaller batches.

        Parameters
        ----------
        dataset : Dataset
            The dataset to split. Only its length is used.
        batch_size : int
            Size of each batch. Must be positive.

        Returns
        -------
        List[Tuple[int, int]]
            List of (start, end) indices for each batch; ``end`` is exclusive
            and the last batch may be shorter than ``batch_size``.

        Raises
        ------
        ValueError
            If ``batch_size`` is not a positive integer.
        """
        if batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size}"
            )

        total_size = len(dataset)
        # Pure index arithmetic: no progress bar needed for this step.
        return [
            (start, min(start + batch_size, total_size))
            for start in range(0, total_size, batch_size)
        ]

    @staticmethod
    def _generate_data(
        flows: List[Flow],
        input_split: Tuple[int, int],
        ds: Dataset,
        i: Optional[int] = None,
    ) -> Optional[Dataset]:
        """Generate data for a single split using the provided flows.

        Parameters
        ----------
        flows : List[Flow]
            List of flows to execute.
        input_split : Tuple[int, int]
            (start, end) indices for the current split.
        ds : Dataset
            The full input dataset.
        i : Optional[int], optional
            Split index for logging, by default None

        Returns
        -------
        Optional[Dataset]
            Generated dataset for the split, or None if generation failed.
        """
        logger.info(f"Processing split {i}")
        split_data = ds.select(range(input_split[0], input_split[1]))
        try:
            # Each flow consumes the output of the previous one.
            for flow in flows:
                split_data = flow.generate(split_data)
            return split_data
        except Exception as e:
            # Deliberate best-effort: a failing split is reported and skipped
            # so the remaining splits can still complete.
            logger.error(f"Error processing split {i}: {e}")
            traceback.print_exc()
            return None

    def generate(
        self, dataset: Dataset, checkpoint_dir: Optional[str] = None
    ) -> Dataset:
        """Generate synthetic data using the configured flows.

        Parameters
        ----------
        dataset : Dataset
            The input dataset to process.
        checkpoint_dir : Optional[str], optional
            Directory to save checkpoints, by default None

        Returns
        -------
        Dataset
            The generated dataset. In batched mode this is None when no
            split produced any rows and nothing was loaded from checkpoints.

        Notes
        -----
        If checkpoint_dir is provided, the generation process can be resumed
        from the last checkpoint in case of interruption.
        """
        # Initialize checkpointer
        checkpointer = Checkpointer(checkpoint_dir, self.save_freq)

        # Load existing checkpoints and determine missing data
        seed_data, pre_generated_data = checkpointer.load_existing_data(dataset)

        # If all data has been generated, return the pre-generated data
        if seed_data.num_rows == 0 and pre_generated_data is not None:
            return pre_generated_data

        if not self.batch_size:
            # If batch size is not provided, generate the dataset in a single pass
            generated_dataset = seed_data
            for flow in self.flows:
                generated_dataset = flow.generate(generated_dataset)

            if pre_generated_data is None:
                return generated_dataset

            # Resuming: keep the rows recovered from checkpoints, exactly as
            # the batched path below does.
            # NOTE(review): assumes load_existing_data returns only the
            # not-yet-generated rows as seed_data - confirm against Checkpointer.
            merged = safe_concatenate_datasets(
                [pre_generated_data, generated_dataset]
            )
            return merged if merged is not None else generated_dataset

        logger.info("Splitting the dataset into smaller batches")
        input_splits = self._split_dataset(seed_data, self.batch_size)
        logger.info(
            f"Generating dataset with {len(input_splits)} splits, "
            f"batch size {self.batch_size}, and {self.num_workers} workers"
        )

        # `is not None` instead of truthiness: an empty Dataset is falsy.
        generated_data = [pre_generated_data] if pre_generated_data is not None else []
        # Position in `generated_data` up to which everything is already on
        # disk. It starts past the pre-generated entry (loaded from existing
        # checkpoints) and is tracked by list length, because the completion
        # counter `i` does not match list positions once a split fails or a
        # pre-generated entry is present.
        last_saved_index = len(generated_data)

        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            futures = [
                executor.submit(
                    self._generate_data, self.flows, input_split, seed_data, i
                )
                for i, input_split in enumerate(input_splits)
            ]

            # `i` counts completed futures (completion order, not split order).
            for i, future in enumerate(tqdm(as_completed(futures), total=len(futures))):
                generated_data_split = future.result()  # Ensure each future completes

                if generated_data_split is not None:
                    generated_data.append(generated_data_split)
                    logger.info(f"Finished future processing split {i} \n\n")

                # Use checkpointer to handle intermediate saves
                if checkpointer.should_save_checkpoint(i):
                    # Save only the new splits since the last checkpoint
                    new_splits = generated_data[last_saved_index:]
                    checkpoint_dataset = safe_concatenate_datasets(new_splits)
                    # Nothing to write if every new split failed or was empty
                    if checkpoint_dataset is not None:
                        checkpointer.save_intermediate_checkpoint(
                            checkpoint_dataset
                        )
                        last_saved_index = len(generated_data)

        generated_dataset = safe_concatenate_datasets(generated_data)

        return generated_dataset
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# Third Party
|
2
|
+
from datasets import concatenate_datasets
|
3
|
+
|
4
|
+
|
5
|
+
def safe_concatenate_datasets(datasets: list):
    """
    Merge the usable entries of ``datasets`` into a single dataset.

    Entries that are None or that have zero rows are skipped. Returns None
    when no usable entry remains; otherwise returns the concatenation of the
    remaining entries, in their original order.
    """
    usable = []
    for candidate in datasets:
        if candidate is None:
            continue
        if candidate.num_rows > 0:
            usable.append(candidate)

    return concatenate_datasets(usable) if usable else None
|
@@ -0,0 +1,190 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: sdg_hub
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Synthetic Data Generation
|
5
|
+
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
|
+
License: Apache-2.0
|
7
|
+
Project-URL: homepage, https://ai-innovation.team/
|
8
|
+
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
|
9
|
+
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
11
|
+
Classifier: Environment :: Console
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23
|
+
Requires-Python: >=3.9
|
24
|
+
Description-Content-Type: text/markdown
|
25
|
+
License-File: LICENSE
|
26
|
+
Requires-Dist: click<9.0.0,>=8.1.7
|
27
|
+
Requires-Dist: datasets<4.0.0,>=2.18.0
|
28
|
+
Requires-Dist: httpx<1.0.0,>=0.25.0
|
29
|
+
Requires-Dist: jinja2
|
30
|
+
Requires-Dist: openai<2.0.0,>=1.13.3
|
31
|
+
Requires-Dist: rich
|
32
|
+
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
33
|
+
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
34
|
+
Provides-Extra: web-interface
|
35
|
+
Requires-Dist: flask>=3.0.2; extra == "web-interface"
|
36
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
|
37
|
+
Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
|
38
|
+
Provides-Extra: vllm
|
39
|
+
Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
|
40
|
+
Requires-Dist: torch>=2.0.0; extra == "vllm"
|
41
|
+
Requires-Dist: transformers>=4.37.0; extra == "vllm"
|
42
|
+
Requires-Dist: accelerate>=0.21.0; extra == "vllm"
|
43
|
+
Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
|
44
|
+
Provides-Extra: examples
|
45
|
+
Requires-Dist: tabulate>=0.9.0; extra == "examples"
|
46
|
+
Requires-Dist: transformers>=4.37.0; extra == "examples"
|
47
|
+
Requires-Dist: langchain-text-splitters; extra == "examples"
|
48
|
+
Requires-Dist: docling>=2.3.0; extra == "examples"
|
49
|
+
Provides-Extra: dev
|
50
|
+
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
51
|
+
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
52
|
+
Requires-Dist: pylint-pydantic; extra == "dev"
|
53
|
+
Requires-Dist: pytest; extra == "dev"
|
54
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
55
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
56
|
+
Requires-Dist: pytest-html; extra == "dev"
|
57
|
+
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
58
|
+
Dynamic: license-file
|
59
|
+
|
60
|
+
# SDG Hub: Synthetic Data Generation Toolkit
|
61
|
+
|
62
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
|
63
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
|
64
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
|
65
|
+
[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
66
|
+
[](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
67
|
+
|
68
|
+
<html>
|
69
|
+
<h3 align="center">
|
70
|
+
A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
|
71
|
+
</h3>
|
72
|
+
<h3 align="center">
|
73
|
+
<a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
|
74
|
+
<a href="examples/">Examples</a> |
|
75
|
+
<a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
|
76
|
+
</h3>
|
77
|
+
</html>
|
78
|
+
|
79
|
+
SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
|
80
|
+
|
81
|
+
**📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
|
82
|
+
|
83
|
+
---
|
84
|
+
|
85
|
+
## ✨ Key Features
|
86
|
+
|
87
|
+
- **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
|
88
|
+
simple YAML configuration files without writing any code.
|
89
|
+
|
90
|
+
- **Modular Block System**: Compose workflows from reusable, self-contained
|
91
|
+
blocks that handle LLM calls, data transformations, and filtering.
|
92
|
+
|
93
|
+
- **LLM-Agnostic**: Works with any language model through configurable
|
94
|
+
prompt templates and generation parameters.
|
95
|
+
|
96
|
+
- **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
|
97
|
+
|
98
|
+
## 🚀 Installation
|
99
|
+
|
100
|
+
### Stable Release (Recommended)
|
101
|
+
|
102
|
+
```bash
|
103
|
+
pip install sdg-hub
|
104
|
+
```
|
105
|
+
|
106
|
+
### Development Version
|
107
|
+
|
108
|
+
```bash
|
109
|
+
pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
110
|
+
```
|
111
|
+
|
112
|
+
## 🏁 Quick Start
|
113
|
+
|
114
|
+
### Prerequisites
|
115
|
+
|
116
|
+
Before getting started, make sure you have:
|
117
|
+
- Python 3.9 or higher
|
118
|
+
- An LLM inference endpoint exposed through an OpenAI-compatible API
|
119
|
+
|
120
|
+
### Simple Example
|
121
|
+
|
122
|
+
Here's the simplest way to get started:
|
123
|
+
|
124
|
+
```python
|
125
|
+
from sdg_hub.flow_runner import run_flow
|
126
|
+
|
127
|
+
# Run a basic knowledge generation flow
|
128
|
+
run_flow(
|
129
|
+
ds_path="my_data.jsonl",
|
130
|
+
save_path="output.jsonl",
|
131
|
+
endpoint="http://0.0.0.0:8000/v1",
|
132
|
+
flow_path="flows/generation/knowledge/synth_knowledge.yaml"
|
133
|
+
)
|
134
|
+
```
|
135
|
+
|
136
|
+
### Advanced Configuration
|
137
|
+
You can invoke any built-in flow using run_flow:
|
138
|
+
```python
|
139
|
+
from sdg_hub.flow_runner import run_flow
|
140
|
+
|
141
|
+
run_flow(
|
142
|
+
ds_path="path/to/dataset.jsonl",
|
143
|
+
save_path="path/to/output.jsonl",
|
144
|
+
endpoint="http://0.0.0.0:8000/v1",
|
145
|
+
flow_path="path/to/flow.yaml",
|
146
|
+
checkpoint_dir="path/to/checkpoints",
|
147
|
+
batch_size=8,
|
148
|
+
num_workers=32,
|
149
|
+
save_freq=2,
|
150
|
+
)
|
151
|
+
```
|
152
|
+
|
153
|
+
### 📂 Available Built-in Flows
|
154
|
+
|
155
|
+
You can start with any of these YAML flows out of the box:
|
156
|
+
|
157
|
+
#### 🔎 **Knowledge Flows**
|
158
|
+
|
159
|
+
| Flow | Description |
|
160
|
+
|------|-------------|
|
161
|
+
| `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
|
162
|
+
| `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
|
163
|
+
|
164
|
+
#### 🧠 **Skills Flows**
|
165
|
+
|
166
|
+
| Flow | Description |
|
167
|
+
|------|-------------|
|
168
|
+
| `synth_skills.yaml` | Freeform skills QA generation (e.g., "Create a new GitHub issue to add type hints") |
|
169
|
+
| `synth_grounded_skills.yaml` | Domain-specific skill generation (e.g., "From the given conversation, create a table for feature requests") |
|
170
|
+
| `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
|
171
|
+
|
172
|
+
All these can be found here: [flows](src/sdg_hub/flows)
|
173
|
+
|
174
|
+
## 📺 Video Tutorial
|
175
|
+
|
176
|
+
For a comprehensive walkthrough of sdg_hub:
|
177
|
+
|
178
|
+
[](https://www.youtube.com/watch?v=aGKCViWjAmA)
|
179
|
+
|
180
|
+
## 🤝 Contributing
|
181
|
+
|
182
|
+
We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
|
183
|
+
|
184
|
+
## 📄 License
|
185
|
+
|
186
|
+
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
|
187
|
+
|
188
|
+
---
|
189
|
+
|
190
|
+
Built with ❤️ by the Red Hat AI Innovation Team
|
@@ -0,0 +1,82 @@
|
|
1
|
+
sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
|
2
|
+
sdg_hub/_version.py,sha256=-LyU5F1uZDjn6Q8_Z6-_FJt_8RE4Kq9zcKdg1abSSps,511
|
3
|
+
sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
|
4
|
+
sdg_hub/flow.py,sha256=YQNtI7KIqdr4zymUIiTe2pJf7xNVoHNs123W_nRimDo,10914
|
5
|
+
sdg_hub/flow_runner.py,sha256=V2VY_gbSWXTujywtFLWr_hKZspkD_41oV5CRzRhKbRo,4829
|
6
|
+
sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
|
7
|
+
sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
|
8
|
+
sdg_hub/prompts.py,sha256=rtiUS2IuaMAQVAy8aAwGxmk23sKC2Qqro7edymbENrk,8165
|
9
|
+
sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
|
11
|
+
sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
|
12
|
+
sdg_hub/blocks/__init__.py,sha256=pmxlv29ohPRdIVE9ojnBs3I58UwNMU0uTtGozOZuZzc,807
|
13
|
+
sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
|
14
|
+
sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
|
15
|
+
sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
|
16
|
+
sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
|
19
|
+
sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
|
20
|
+
sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
|
21
|
+
sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
|
22
|
+
sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
|
23
|
+
sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
|
+
sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=9icyigsMooyBR_nEwWgj9eBAnuc3kMZMNnEy6AxFSKU,2430
|
25
|
+
sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
|
26
|
+
sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=PBymlZljkzN8kbo5DgmNsSM_Xb76SZifuS5Yl-x4Uy4,365
|
27
|
+
sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
|
28
|
+
sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
|
29
|
+
sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
|
30
|
+
sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=06Z9lDiZUsQEURhpwWUVXA3wYO3bRaC0aNoGCpo3-44,376
|
31
|
+
sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
|
32
|
+
sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
|
33
|
+
sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
|
34
|
+
sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
|
35
|
+
sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
|
36
|
+
sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
|
38
|
+
sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
39
|
+
sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
|
40
|
+
sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
|
41
|
+
sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
|
42
|
+
sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
|
43
|
+
sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
|
44
|
+
sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
|
45
|
+
sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
|
46
|
+
sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
|
47
|
+
sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
|
48
|
+
sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
|
49
|
+
sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
|
50
|
+
sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
|
51
|
+
sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
|
52
|
+
sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
|
53
|
+
sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
|
54
|
+
sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
|
55
|
+
sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
|
56
|
+
sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
|
57
|
+
sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
|
58
|
+
sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
|
59
|
+
sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
60
|
+
sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
|
61
|
+
sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
|
62
|
+
sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
|
63
|
+
sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
|
64
|
+
sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
|
65
|
+
sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
|
66
|
+
sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
|
67
|
+
sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
|
68
|
+
sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
|
69
|
+
sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
|
70
|
+
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=aVnHkp0DkeuVgWdZ2eUQf5-uzI8tPYOkrs27yoF8m5g,3393
|
71
|
+
sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
|
72
|
+
sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
|
73
|
+
sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
|
74
|
+
sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
|
75
|
+
sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
|
76
|
+
sdg_hub/utils/__init__.py,sha256=UEo-9qPt5iVKBIRvgZhOI0SoIBO6zeBxOuLvUQXaM3g,185
|
77
|
+
sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
|
78
|
+
sdg_hub-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
79
|
+
sdg_hub-0.1.0.dist-info/METADATA,sha256=y2r66Mmm4OCTiouJZcLVyTFHR1o3eqD5pAlS-8ow8uE,7247
|
80
|
+
sdg_hub-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
81
|
+
sdg_hub-0.1.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
|
82
|
+
sdg_hub-0.1.0.dist-info/RECORD,,
|