sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/llmblock.py +35 -18
- sdg_hub/prompts.py +5 -0
- sdg_hub-0.1.0a4.dist-info/METADATA +309 -0
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.0a4.dist-info}/RECORD +8 -8
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.0a4.dist-info}/WHEEL +1 -1
- sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.0a4.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
18
18
|
version_tuple: VERSION_TUPLE
|
19
19
|
|
20
|
-
__version__ = version = '0.1.0a3'
|
21
|
-
__version_tuple__ = version_tuple = (0, 1, 0)
|
20
|
+
__version__ = version = '0.1.0a4'
|
21
|
+
__version_tuple__ = version_tuple = (0, 1, 0, 'a4')
|
sdg_hub/blocks/llmblock.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
2
|
# Standard
|
3
|
-
from collections import Counter
|
4
3
|
from typing import Any, Dict, List
|
4
|
+
from typing import Optional
|
5
5
|
import json
|
6
6
|
import re
|
7
7
|
|
@@ -84,6 +84,27 @@ class LLMBlock(Block):
|
|
84
84
|
# and supports the n parameter to generate n outputs per input
|
85
85
|
self.server_supports_batched = server_supports_batched(client, self.model)
|
86
86
|
|
87
|
+
|
88
|
+
def _extract_matches(
|
89
|
+
self, text: str, start_tag: Optional[str], end_tag: Optional[str]
|
90
|
+
) -> List[str]:
|
91
|
+
if not text:
|
92
|
+
return []
|
93
|
+
if not start_tag and not end_tag:
|
94
|
+
return [text.strip()]
|
95
|
+
|
96
|
+
pattern = ""
|
97
|
+
if start_tag:
|
98
|
+
pattern += re.escape(start_tag)
|
99
|
+
pattern += r"(.*?)"
|
100
|
+
if end_tag:
|
101
|
+
pattern += re.escape(end_tag)
|
102
|
+
elif start_tag:
|
103
|
+
# Enforce matching till end of string when only start_tag is provided.
|
104
|
+
pattern += "$"
|
105
|
+
|
106
|
+
return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
|
107
|
+
|
87
108
|
def _parse(self, generated_string) -> dict:
|
88
109
|
matches = {}
|
89
110
|
|
@@ -108,16 +129,9 @@ class LLMBlock(Block):
|
|
108
129
|
self.block_config.get("end_tags", []),
|
109
130
|
self.output_cols,
|
110
131
|
):
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
]
|
115
|
-
else:
|
116
|
-
pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
|
117
|
-
all_matches = re.findall(pattern, generated_string, re.DOTALL)
|
118
|
-
matches[output_col] = (
|
119
|
-
[match.strip() for match in all_matches] if all_matches else []
|
120
|
-
)
|
132
|
+
matches[output_col] = self._extract_matches(
|
133
|
+
generated_string, start_tag, end_tag
|
134
|
+
)
|
121
135
|
|
122
136
|
return matches
|
123
137
|
|
@@ -247,11 +261,12 @@ class ConditionalLLMBlock(LLMBlock):
|
|
247
261
|
for config_key, config in config_paths.items():
|
248
262
|
# Template(self.prompt_struct.format(**filtered_config))
|
249
263
|
filtered_config = {
|
250
|
-
k: (v if v is not None else "")
|
264
|
+
k: (v if v is not None else "")
|
265
|
+
for k, v in self.block_config.items()
|
251
266
|
}
|
252
|
-
self.prompt_template[config_key] = Template(
|
253
|
-
**self._load_config(config)
|
254
|
-
)
|
267
|
+
self.prompt_template[config_key] = Template(
|
268
|
+
self.prompt_struct.format(**self._load_config(config))
|
269
|
+
)
|
255
270
|
|
256
271
|
def _format_prompt(self, sample: Dict) -> str:
|
257
272
|
if isinstance(self.prompt_template, dict):
|
@@ -380,7 +395,7 @@ class LLMMessagesBlock(Block):
|
|
380
395
|
client,
|
381
396
|
input_col,
|
382
397
|
output_col,
|
383
|
-
model_prompt=None,
|
398
|
+
model_prompt=None,
|
384
399
|
model_id=None,
|
385
400
|
**batch_kwargs,
|
386
401
|
) -> None:
|
@@ -395,7 +410,7 @@ class LLMMessagesBlock(Block):
|
|
395
410
|
self.model = model_id
|
396
411
|
else:
|
397
412
|
self.model = self.client.models.list().data[0].id
|
398
|
-
|
413
|
+
|
399
414
|
self.defaults = {
|
400
415
|
"model": self.model,
|
401
416
|
"temperature": 0,
|
@@ -417,7 +432,9 @@ class LLMMessagesBlock(Block):
|
|
417
432
|
results = []
|
418
433
|
n = gen_kwargs.get("n", 1)
|
419
434
|
for message in messages:
|
420
|
-
responses = self.client.chat.completions.create(
|
435
|
+
responses = self.client.chat.completions.create(
|
436
|
+
messages=message, **generate_args
|
437
|
+
)
|
421
438
|
if n > 1:
|
422
439
|
results.append([choice.message.content for choice in responses.choices])
|
423
440
|
else:
|
sdg_hub/prompts.py
CHANGED
@@ -15,3 +15,8 @@ def instructlab_chat_template():
|
|
15
15
|
@PromptRegistry.register("mistralai")
|
16
16
|
def mistral_chat_template():
|
17
17
|
return """{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n<s>\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + '</s>'}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n"""
|
18
|
+
|
19
|
+
|
20
|
+
@PromptRegistry.register("meta-llama/Llama-3.3")
|
21
|
+
def meta_llama_chat_template():
|
22
|
+
return """{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
|
@@ -0,0 +1,309 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: sdg_hub
|
3
|
+
Version: 0.1.0a4
|
4
|
+
Summary: Synthetic Data Generation
|
5
|
+
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
|
+
License: Apache-2.0
|
7
|
+
Project-URL: homepage, https://ai-innovation.team/
|
8
|
+
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
|
9
|
+
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
11
|
+
Classifier: Environment :: Console
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23
|
+
Requires-Python: >=3.9
|
24
|
+
Description-Content-Type: text/markdown
|
25
|
+
License-File: LICENSE
|
26
|
+
Requires-Dist: click<9.0.0,>=8.1.7
|
27
|
+
Requires-Dist: datasets<4.0.0,>=2.18.0
|
28
|
+
Requires-Dist: httpx<1.0.0,>=0.25.0
|
29
|
+
Requires-Dist: jinja2
|
30
|
+
Requires-Dist: langchain-text-splitters
|
31
|
+
Requires-Dist: openai<2.0.0,>=1.13.3
|
32
|
+
Requires-Dist: rich
|
33
|
+
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
34
|
+
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
35
|
+
Dynamic: license-file
|
36
|
+
|
37
|
+
# sdg_hub: Synthetic Data Generation Toolkit for LLMs
|
38
|
+
|
39
|
+

|
40
|
+

|
41
|
+

|
42
|
+
[Test workflow status](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
43
|
+
[Code coverage on Codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
44
|
+
|
45
|
+
sdg_hub is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, this framework is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful pipelines for generating data and processing tasks.
|
46
|
+
|
47
|
+
|
48
|
+
## Installation
|
49
|
+
|
50
|
+
Latest release from PyPI
|
51
|
+
|
52
|
+
```sh
|
53
|
+
pip install sdg-hub
|
54
|
+
```
|
55
|
+
|
56
|
+
Latest main branch
|
57
|
+
```sh
|
58
|
+
pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
59
|
+
```
|
60
|
+
|
61
|
+
## Core Design Principles
|
62
|
+
|
63
|
+
The framework is built around the following principles:
|
64
|
+
|
65
|
+
1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
|
66
|
+
2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
|
67
|
+
3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.
|
68
|
+
|
69
|
+
---
|
70
|
+
|
71
|
+
## Framework Architecture
|
72
|
+
|
73
|
+

|
74
|
+
|
75
|
+
### Blocks: The Fundamental Unit
|
76
|
+
|
77
|
+
At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs specific tasks, such as:
|
78
|
+
|
79
|
+
- Making LLM calls
|
80
|
+
- Performing data transformations
|
81
|
+
- Applying filters
|
82
|
+
|
83
|
+
Blocks are designed to be:
|
84
|
+
- **Modular**: Reusable across multiple pipelines.
|
85
|
+
- **Composable**: Easily chained together to create workflows.
|
86
|
+
|
87
|
+
These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
|
88
|
+
|
89
|
+
### Prompts
|
90
|
+
|
91
|
+
Prompts are at the core of how LLMs are instructed within SDG Hub. Each `LLMBlock` is associated with a prompt configuration file written in YAML, allowing users to define the exact behavior of the language model — including system instructions, generation principles, and output formatting.
|
92
|
+
|
93
|
+
#### Prompt YAML Structure
|
94
|
+
|
95
|
+
A typical prompt YAML file looks like this:
|
96
|
+
|
97
|
+
```yaml
|
98
|
+
system: You are a helpful assistant that can summarize text.
|
99
|
+
introduction: Give me a short summary of the text.
|
100
|
+
principles:
|
101
|
+
- Do not add any new information.
|
102
|
+
- Do not miss any key points from the provided text.
|
103
|
+
examples:
|
104
|
+
- input: Red Hat announced the acquisition of Neural Magic...
|
105
|
+
output: Red Hat acquired Neural Magic to enhance its AI optimization capabilities.
|
106
|
+
generation: Here is the document to summarize: {{document}}
|
107
|
+
```
|
108
|
+
|
109
|
+
#### Key Fields
|
110
|
+
* `system`: A high-level instruction that sets the persona or behavior of the model.
|
111
|
+
* `introduction`: Optional introduction to set context for the user.
|
112
|
+
* `principles`: A list of guiding constraints or rules the model should follow during generation.
|
113
|
+
* `examples`: Few-shot examples (optional) to guide output format or tone.
|
114
|
+
* `generation`: The actual template used to generate the model input. This supports variable injection using `{{variable_name}}`.
|
115
|
+
|
116
|
+
### YAML-Based Workflow: The Flow
|
117
|
+
|
118
|
+
The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.
|
119
|
+
|
120
|
+
#### Key Features of a Flow
|
121
|
+
|
122
|
+
1. **Modular Design**:
|
123
|
+
- Flows are composed of blocks, which can be chained together into pipelines.
|
124
|
+
- Each block performs a specific task, such as generating, filtering, or transforming data.
|
125
|
+
|
126
|
+
2. **Reusability**:
|
127
|
+
- Blocks and configurations defined in a Flow can be reused across different workflows.
|
128
|
+
- YAML makes it easy to tweak or extend workflows without significant changes.
|
129
|
+
|
130
|
+
3. **Ease of Configuration**:
|
131
|
+
- Users can specify block types, configurations, and data processing details in a simple and intuitive manner.
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
## Hello World Example
|
136
|
+
|
137
|
+
Let’s say you have a document and want to generate a concise summary using an LLM. Here’s how simple that is in sdg\_hub:
|
138
|
+
|
139
|
+
```yaml
|
140
|
+
- block_type: LLMBlock
|
141
|
+
block_config:
|
142
|
+
block_name: gen_summary
|
143
|
+
config_path: prompts/summarization.yaml
|
144
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
145
|
+
output_cols:
|
146
|
+
- summary
|
147
|
+
gen_kwargs:
|
148
|
+
max_tokens: 512
|
149
|
+
```
|
150
|
+
|
151
|
+
Want to go further? Add another block to extract keywords from the summary:
|
152
|
+
|
153
|
+
```yaml
|
154
|
+
- block_type: LLMBlock
|
155
|
+
block_config:
|
156
|
+
block_name: gen_keywords
|
157
|
+
config_path: prompts/keywords.yaml
|
158
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
159
|
+
output_cols:
|
160
|
+
- keywords
|
161
|
+
gen_kwargs:
|
162
|
+
max_tokens: 64
|
163
|
+
```
|
164
|
+
|
165
|
+
Just like that, you’ve built a multi-step LLM workflow using nothing but YAML.
|
166
|
+
|
167
|
+
## Available Blocks
|
168
|
+
|
169
|
+
The SDG Framework provides a rich set of blocks for different data processing needs. Here's a comprehensive overview of the available blocks and when to use them:
|
170
|
+
|
171
|
+
### Base Block Class
|
172
|
+
|
173
|
+
The framework is built around the abstract `Block` class, which serves as the foundation for all other blocks:
|
174
|
+
|
175
|
+
- **Purpose**: Provides core functionality and interface for all blocks
|
176
|
+
- **Key Features**:
|
177
|
+
- Template validation for input data
|
178
|
+
- Configuration loading from YAML files
|
179
|
+
- Standardized block initialization
|
180
|
+
- Common interface for all blocks
|
181
|
+
- **Core Methods**:
|
182
|
+
- `_validate`: Validates input data against templates
|
183
|
+
- `_load_config`: Loads configuration from YAML files
|
184
|
+
- `generate`: Abstract method for block execution
|
185
|
+
|
186
|
+
All blocks inherit from this base class, ensuring consistent behavior and interface across the framework.
|
187
|
+
|
188
|
+
### LLM Blocks
|
189
|
+
|
190
|
+
1. **LLMBlock**
|
191
|
+
- **Purpose**: Generate text using language models
|
192
|
+
- **Use Cases**:
|
193
|
+
- Generating questions, responses, or any text content
|
194
|
+
- Single-prompt generation with structured outputs
|
195
|
+
- **Features**:
|
196
|
+
- Supports batched processing
|
197
|
+
- Configurable output parsing
|
198
|
+
- Template-based prompt generation
|
199
|
+
|
200
|
+
2. **ConditionalLLMBlock**
|
201
|
+
- **Purpose**: Generate text based on conditional logic
|
202
|
+
- **Use Cases**:
|
203
|
+
- Different prompt templates based on input conditions
|
204
|
+
- Multi-path text generation workflows
|
205
|
+
- **Features**:
|
206
|
+
- Multiple config paths for different conditions
|
207
|
+
- Dynamic prompt selection
|
208
|
+
|
209
|
+
3. **LLMLogProbBlock**
|
210
|
+
- **Purpose**: Generate text with log probabilities
|
211
|
+
- **Use Cases**:
|
212
|
+
- Analyzing model confidence
|
213
|
+
- Quality scoring of generations
|
214
|
+
- **Features**:
|
215
|
+
- Returns top-k log probabilities
|
216
|
+
- JSON-formatted output
|
217
|
+
|
218
|
+
4. **LLMMessagesBlock**
|
219
|
+
- **Purpose**: Chat-based text generation
|
220
|
+
- **Use Cases**:
|
221
|
+
- Multi-turn conversations
|
222
|
+
- Chat-based interactions
|
223
|
+
- **Features**:
|
224
|
+
- Supports message history
|
225
|
+
- Chat completion API
|
226
|
+
|
227
|
+
### Filtering and Processing Blocks
|
228
|
+
|
229
|
+
1. **FilterByValueBlock**
|
230
|
+
- **Purpose**: Filter datasets based on column values
|
231
|
+
- **Use Cases**:
|
232
|
+
- Removing unwanted samples
|
233
|
+
- Data cleaning
|
234
|
+
- Quality filtering
|
235
|
+
- **Features**:
|
236
|
+
- Multiple filter operations
|
237
|
+
- Type conversion support
|
238
|
+
- Parallel processing
|
239
|
+
|
240
|
+
2. **IterBlock**
|
241
|
+
- **Purpose**: Iterative processing of data
|
242
|
+
- **Use Cases**:
|
243
|
+
- Multiple generation attempts
|
244
|
+
- Iterative refinement
|
245
|
+
- **Features**:
|
246
|
+
- Configurable number of iterations
|
247
|
+
- Nested block execution
|
248
|
+
|
249
|
+
|
250
|
+
|
251
|
+
### Utility Blocks
|
252
|
+
|
253
|
+
1. **SamplePopulatorBlock**
|
254
|
+
- **Purpose**: Populate samples with configuration data
|
255
|
+
- **Use Cases**:
|
256
|
+
- Adding metadata
|
257
|
+
- Configuration injection
|
258
|
+
|
259
|
+
2. **SelectorBlock**
|
260
|
+
- **Purpose**: Select data based on mapping
|
261
|
+
- **Use Cases**:
|
262
|
+
- Conditional data selection
|
263
|
+
- Data routing
|
264
|
+
|
265
|
+
3. **CombineColumnsBlock**
|
266
|
+
- **Purpose**: Merge multiple columns
|
267
|
+
- **Use Cases**:
|
268
|
+
- Text concatenation
|
269
|
+
- Feature combination
|
270
|
+
|
271
|
+
4. **FlattenColumnsBlock**
|
272
|
+
- **Purpose**: Convert wide to long format
|
273
|
+
- **Use Cases**:
|
274
|
+
- Data reshaping
|
275
|
+
- Variable-value pairs
|
276
|
+
|
277
|
+
5. **DuplicateColumns**
|
278
|
+
- **Purpose**: Create column copies
|
279
|
+
- **Use Cases**:
|
280
|
+
- Data preservation
|
281
|
+
- Multiple processing paths
|
282
|
+
|
283
|
+
6. **RenameColumns**
|
284
|
+
- **Purpose**: Rename dataset columns
|
285
|
+
- **Use Cases**:
|
286
|
+
- Standardizing column names
|
287
|
+
- Data reorganization
|
288
|
+
|
289
|
+
7. **SetToMajorityValue**
|
290
|
+
- **Purpose**: Replace values with majority
|
291
|
+
- **Use Cases**:
|
292
|
+
- Data normalization
|
293
|
+
- Outlier handling
|
294
|
+
|
295
|
+
---
|
296
|
+
### Dataflow and Storage
|
297
|
+
|
298
|
+
- **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables. This provides:
|
299
|
+
- Native parallelization capabilities (e.g., maps, filters).
|
300
|
+
- Support for efficient data transformations.
|
301
|
+
|
302
|
+
- **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
|
303
|
+
- Resume workflows from the last successful state if interrupted.
|
304
|
+
- Improve reliability for long-running workflows.
|
305
|
+
|
306
|
+
|
307
|
+
## Examples
|
308
|
+
|
309
|
+
For sample use cases and implementation examples, please refer to the [examples](examples) directory. This directory contains various examples demonstrating different workflows and use cases of the SDG Framework.
|
@@ -1,9 +1,9 @@
|
|
1
1
|
sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
|
2
|
-
sdg_hub/_version.py,sha256=
|
2
|
+
sdg_hub/_version.py,sha256=c_h4q533sxFKNKMu0mPWtg9zCSI76KH0QdADDgQNFCY,519
|
3
3
|
sdg_hub/flow.py,sha256=3b97fMei1rWuQWeNfv-xyHKUbcMaf-d_b9Xms9J3BCQ,5425
|
4
4
|
sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
|
5
5
|
sdg_hub/pipeline.py,sha256=u24ccryfy_nOSvsrWiynNmq1rOmOOkw1L5-TqJvuRSo,2339
|
6
|
-
sdg_hub/prompts.py,sha256=
|
6
|
+
sdg_hub/prompts.py,sha256=jdpTUaVfsTeKTc7xUHa8ONFbv8l8Cd33Ozc9rFYFIAU,6891
|
7
7
|
sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
|
9
9
|
sdg_hub/sdg.py,sha256=SXXnDGA3MpYlNpsw4XyImL97l0pXiF5P9jrDkZNlDJc,6492
|
@@ -11,7 +11,7 @@ sdg_hub/blocks/__init__.py,sha256=OwPWofuBBWG7n0nYAXNtFXdq4rPf7FyvKkPfjUBlqec,13
|
|
11
11
|
sdg_hub/blocks/block.py,sha256=ObJp8JaAhQ3lQK6SOYoqHPc7b2hBZMhOXEmIap_qa1k,1788
|
12
12
|
sdg_hub/blocks/filterblock.py,sha256=leH0k3stcRzdCWoy8kI2hFruGJ0VUemeA4QBW1eQcdQ,2650
|
13
13
|
sdg_hub/blocks/iterblock.py,sha256=7UZnK_JyQfbMhVNVzZ79TtEtADLuosI0z62LhoP63s4,958
|
14
|
-
sdg_hub/blocks/llmblock.py,sha256=
|
14
|
+
sdg_hub/blocks/llmblock.py,sha256=Diyd-BLsZchDH6-w4pXSUiyFHK-EpY64cNYSVrrteos,16427
|
15
15
|
sdg_hub/blocks/rmblocks.py,sha256=nw0p1LytHO7Dmc8RGfJ5uajDQWM93-oNoYrzhaY2QEY,6222
|
16
16
|
sdg_hub/blocks/utilblocks.py,sha256=nAehqcDKiDE5W3REGApytYAXztRm9AW65cAy95Ufb8U,4926
|
17
17
|
sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -83,8 +83,8 @@ sdg_hub/utils/chunking.py,sha256=VSPQ8dSFI5LF4sefcI0tzWG0Vc1rM_FSMTO6xg_iFzA,255
|
|
83
83
|
sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
|
84
84
|
sdg_hub/utils/docprocessor.py,sha256=Z4J2DfLhRxMCeIeMKttwi-FdivmPqI-hjEwq6-Ub35c,12485
|
85
85
|
sdg_hub/utils/parse_and_convert.py,sha256=I27FdS-H2mSoZ07SsKZmNYM2F_Cg7GHTBXD7YNgASNw,13443
|
86
|
-
sdg_hub-0.1.
|
87
|
-
sdg_hub-0.1.
|
88
|
-
sdg_hub-0.1.
|
89
|
-
sdg_hub-0.1.
|
90
|
-
sdg_hub-0.1.
|
86
|
+
sdg_hub-0.1.0a4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
87
|
+
sdg_hub-0.1.0a4.dist-info/METADATA,sha256=2Zs4Sbcg5sG5iBsITJ7wt5nxpN1RiAEtRWgG6P81AUY,11012
|
88
|
+
sdg_hub-0.1.0a4.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
89
|
+
sdg_hub-0.1.0a4.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
|
90
|
+
sdg_hub-0.1.0a4.dist-info/RECORD,,
|
@@ -1,154 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: sdg_hub
|
3
|
-
Version: 0.1.0a3
|
4
|
-
Summary: Synthetic Data Generation
|
5
|
-
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
|
-
License: Apache-2.0
|
7
|
-
Project-URL: homepage, https://ai-innovation.team/
|
8
|
-
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
|
9
|
-
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
|
10
|
-
Classifier: Development Status :: 3 - Alpha
|
11
|
-
Classifier: Environment :: Console
|
12
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
13
|
-
Classifier: License :: OSI Approved :: MIT License
|
14
|
-
Classifier: Operating System :: MacOS :: MacOS X
|
15
|
-
Classifier: Operating System :: POSIX :: Linux
|
16
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
18
|
-
Classifier: Programming Language :: Python :: 3.9
|
19
|
-
Classifier: Programming Language :: Python :: 3.10
|
20
|
-
Classifier: Programming Language :: Python :: 3.11
|
21
|
-
Classifier: Programming Language :: Python :: 3.12
|
22
|
-
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23
|
-
Requires-Python: >=3.9
|
24
|
-
Description-Content-Type: text/markdown
|
25
|
-
License-File: LICENSE
|
26
|
-
Requires-Dist: click<9.0.0,>=8.1.7
|
27
|
-
Requires-Dist: datasets<4.0.0,>=2.18.0
|
28
|
-
Requires-Dist: httpx<1.0.0,>=0.25.0
|
29
|
-
Requires-Dist: jinja2
|
30
|
-
Requires-Dist: langchain-text-splitters
|
31
|
-
Requires-Dist: openai<2.0.0,>=1.13.3
|
32
|
-
Requires-Dist: rich
|
33
|
-
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
34
|
-
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
35
|
-
Dynamic: license-file
|
36
|
-
|
37
|
-
# Synthetic Data Generation for LLMs
|
38
|
-
|
39
|
-
The SDG Framework is a modular, scalable, and efficient solution for creating synthetic data generation workflows in a "no-code" manner. At its core, this framework is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful pipelines for generating data and processing tasks.
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
## Core Design Principles
|
44
|
-
|
45
|
-
The framework is built around the following principles:
|
46
|
-
|
47
|
-
1. **Modular Design**: Highly composable blocks form the building units of the framework, allowing users to build workflows effortlessly.
|
48
|
-
2. **No-Code Workflow Creation**: Specify workflows using simple YAML configuration files.
|
49
|
-
3. **Scalability and Performance**: Optimized for handling large-scale workflows with millions of records.
|
50
|
-
|
51
|
-
---
|
52
|
-
|
53
|
-
## Framework Architecture
|
54
|
-
|
55
|
-

|
56
|
-
|
57
|
-
### Blocks: The Fundamental Unit
|
58
|
-
|
59
|
-
At the heart of the framework is the **Block**. Each block is a self-contained computational unit that performs specific tasks, such as:
|
60
|
-
|
61
|
-
- Making LLM calls
|
62
|
-
- Performing data transformations
|
63
|
-
- Applying filters
|
64
|
-
|
65
|
-
Blocks are designed to be:
|
66
|
-
- **Modular**: Reusable across multiple pipelines.
|
67
|
-
- **Composable**: Easily chained together to create workflows.
|
68
|
-
|
69
|
-
These blocks are implemented in the [src/sdg_hub/blocks](src/sdg_hub/blocks) directory.
|
70
|
-
|
71
|
-
### Pipelines: Higher-Level Abstraction
|
72
|
-
|
73
|
-
Blocks can be chained together to form a **Pipeline**. Pipelines enable:
|
74
|
-
- Linear or recursive chaining of blocks.
|
75
|
-
- Execution of complex workflows by chaining multiple pipelines together.
|
76
|
-
|
77
|
-
### SDG Workflow: Full Workflow Automation
|
78
|
-
|
79
|
-
Pipelines are further orchestrated into **SDG Workflows**, enabling seamless end-to-end processing. Invoking `sdg_hub.generate` triggers one or more pipelines that process data through all the configured blocks.
|
80
|
-
|
81
|
-
---
|
82
|
-
|
83
|
-
### YAML-Based Workflow: The Flow
|
84
|
-
|
85
|
-
The YAML configuration file, known as the **Flow**, is central to defining data generation workflows in the SDG Framework. A Flow describes how blocks and pipelines are orchestrated to process and generate data efficiently. By leveraging YAML, users can create highly customizable and modular workflows without writing any code.
|
86
|
-
|
87
|
-
#### Key Features of a Flow
|
88
|
-
|
89
|
-
1. **Modular Design**:
|
90
|
-
- Flows are composed of blocks, which can be chained together into pipelines.
|
91
|
-
- Each block performs a specific task, such as generating, filtering, or transforming data.
|
92
|
-
|
93
|
-
2. **Reusability**:
|
94
|
-
- Blocks and configurations defined in a Flow can be reused across different workflows.
|
95
|
-
- YAML makes it easy to tweak or extend workflows without significant changes.
|
96
|
-
|
97
|
-
3. **Ease of Configuration**:
|
98
|
-
- Users can specify block types, configurations, and data processing details in a simple and intuitive manner.
|
99
|
-
|
100
|
-
---
|
101
|
-
|
102
|
-
### Sample Flow
|
103
|
-
|
104
|
-
Here is an example of a Flow configuration:
|
105
|
-
|
106
|
-
```yaml
|
107
|
-
- block_type: LLMBlock
|
108
|
-
block_config:
|
109
|
-
block_name: gen_questions
|
110
|
-
config_path: configs/skills/freeform_questions.yaml
|
111
|
-
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
112
|
-
output_cols:
|
113
|
-
- question
|
114
|
-
batch_kwargs:
|
115
|
-
num_samples: 30
|
116
|
-
drop_duplicates:
|
117
|
-
- question
|
118
|
-
- block_type: FilterByValueBlock
|
119
|
-
block_config:
|
120
|
-
block_name: filter_questions
|
121
|
-
filter_column: score
|
122
|
-
filter_value: 1.0
|
123
|
-
operation: operator.eq
|
124
|
-
convert_dtype: float
|
125
|
-
batch_kwargs:
|
126
|
-
num_procs: 8
|
127
|
-
drop_columns:
|
128
|
-
- evaluation
|
129
|
-
- score
|
130
|
-
- num_samples
|
131
|
-
- block_type: LLMBlock
|
132
|
-
block_config:
|
133
|
-
block_name: gen_responses
|
134
|
-
config_path: configs/skills/freeform_responses.yaml
|
135
|
-
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
136
|
-
output_cols:
|
137
|
-
- response
|
138
|
-
```
|
139
|
-
|
140
|
-
### Dataflow and Storage
|
141
|
-
|
142
|
-
- **Data Representation**: Dataflow between blocks and pipelines is handled using **Hugging Face Datasets**, which are based on Arrow tables. This provides:
|
143
|
-
- Native parallelization capabilities (e.g., maps, filters).
|
144
|
-
- Support for efficient data transformations.
|
145
|
-
|
146
|
-
- **Data Checkpoints**: Intermediate caches of generated data. Checkpoints allow users to:
|
147
|
-
- Resume workflows from the last successful state if interrupted.
|
148
|
-
- Improve reliability for long-running workflows.
|
149
|
-
|
150
|
-
---
|
151
|
-
|
152
|
-
## Examples
|
153
|
-
|
154
|
-
For sample use cases and implementation examples, please refer to the [examples](examples) directory. This directory contains various examples demonstrating different workflows and use cases of the SDG Framework.
|
File without changes
|
File without changes
|