sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/__init__.py +35 -5
- sdg_hub/blocks/block.py +58 -16
- sdg_hub/blocks/llmblock.py +121 -193
- sdg_hub/blocks/utilblocks.py +500 -43
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
- sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
- sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
- sdg_hub/configs/skills/contexts.yaml +18 -11
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
- sdg_hub/configs/skills/freeform_questions.yaml +21 -16
- sdg_hub/configs/skills/freeform_responses.yaml +19 -25
- sdg_hub/configs/skills/router.yaml +53 -6
- sdg_hub/flow.py +351 -21
- sdg_hub/flow_runner.py +216 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
- sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
- sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
- sdg_hub/pipeline.py +67 -12
- sdg_hub/prompts.py +21 -0
- sdg_hub/sdg.py +128 -86
- sdg_hub/utils/config_validation.py +91 -0
- sdg_hub/utils/validation_result.py +10 -0
- sdg_hub-0.1.1.dist-info/METADATA +190 -0
- sdg_hub-0.1.1.dist-info/RECORD +86 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
- sdg_hub/blocks/filterblock.py +0 -76
- sdg_hub/blocks/iterblock.py +0 -31
- sdg_hub/blocks/rmblocks.py +0 -194
- sdg_hub/configs/annotations/simple.yaml +0 -10
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
- sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
- sdg_hub/utils/chunking.py +0 -73
- sdg_hub/utils/docprocessor.py +0 -357
- sdg_hub/utils/parse_and_convert.py +0 -392
- sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
- sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
- /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
- /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
- /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
- /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
- /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
- /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
- /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
- /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
sdg_hub-0.1.1.dist-info/METADATA ADDED
@@ -0,0 +1,190 @@
+Metadata-Version: 2.4
+Name: sdg_hub
+Version: 0.1.1
+Summary: Synthetic Data Generation
+Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
+License: Apache-2.0
+Project-URL: homepage, https://ai-innovation.team/
+Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
+Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
+Classifier: Development Status :: 3 - Alpha
+Classifier: Environment :: Console
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: click<9.0.0,>=8.1.7
+Requires-Dist: datasets<4.0.0,>=2.18.0
+Requires-Dist: httpx<1.0.0,>=0.25.0
+Requires-Dist: jinja2
+Requires-Dist: openai<2.0.0,>=1.13.3
+Requires-Dist: rich
+Requires-Dist: tenacity!=8.4.0,>=8.3.0
+Requires-Dist: tqdm<5.0.0,>=4.66.2
+Provides-Extra: web-interface
+Requires-Dist: flask>=3.0.2; extra == "web-interface"
+Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
+Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
+Provides-Extra: vllm
+Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
+Requires-Dist: torch>=2.0.0; extra == "vllm"
+Requires-Dist: transformers>=4.37.0; extra == "vllm"
+Requires-Dist: accelerate>=0.21.0; extra == "vllm"
+Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
+Provides-Extra: examples
+Requires-Dist: tabulate>=0.9.0; extra == "examples"
+Requires-Dist: transformers>=4.37.0; extra == "examples"
+Requires-Dist: langchain-text-splitters; extra == "examples"
+Requires-Dist: docling>=2.3.0; extra == "examples"
+Provides-Extra: dev
+Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
+Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
+Requires-Dist: pylint-pydantic; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-asyncio; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pytest-html; extra == "dev"
+Requires-Dist: tox<5,>=4.4.2; extra == "dev"
+Dynamic: license-file
+
+# SDG Hub: Synthetic Data Generation Toolkit
+
+[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
+[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
+[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
+[](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
+[](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
+
+<html>
+<h3 align="center">
+A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
+</h3>
+<h3 align="center">
+<a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
+<a href="examples/">Examples</a> |
+<a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
+</h3>
+</html>
+
+SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
+
+**📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
+
+---
+
+## ✨ Key Features
+
+- **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
+  simple YAML configuration files without writing any code.
+
+- **Modular Block System**: Compose workflows from reusable, self-contained
+  blocks that handle LLM calls, data transformations, and filtering.
+
+- **LLM-Agnostic**: Works with any language model through configurable
+  prompt templates and generation parameters.
+
+- **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
+
+## 🚀 Installation
+
+### Stable Release (Recommended)
+
+```bash
+pip install sdg-hub
+```
+
+### Development Version
+
+```bash
+pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
+```
+
+## 🏁 Quick Start
+
+### Prerequisites
+
+Before getting started, make sure you have:
+- Python 3.8 or higher
+- LLM Inference Endpoint exposed through OpenAI API
+
+### Simple Example
+
+Here's the simplest way to get started:
+
+```python
+from sdg_hub.flow_runner import run_flow
+
+# Run a basic knowledge generation flow
+run_flow(
+    ds_path="my_data.jsonl",
+    save_path="output.jsonl",
+    endpoint="http://0.0.0.0:8000/v1",
+    flow_path="flows/generation/knowledge/synth_knowledge.yaml"
+)
+```
+
+### Advanced Configuration
+You can invoke any built-in flow using run_flow:
+```python
+from sdg_hub.flow_runner import run_flow
+
+run_flow(
+    ds_path="path/to/dataset.jsonl",
+    save_path="path/to/output.jsonl",
+    endpoint="http://0.0.0.0:8000/v1",
+    flow_path="path/to/flow.yaml",
+    checkpoint_dir="path/to/checkpoints",
+    batch_size=8,
+    num_workers=32,
+    save_freq=2,
+)
+```
+
+### 📂 Available Built-in Flows
+
+You can start with any of these YAML flows out of the box:
+
+#### 🔎 **Knowledge Flows**
+
+| Flow | Description |
+|------|-------------|
+| `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
+| `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
+
+#### 🧠 **Skills Flows**
+
+| Flow | Description |
+|------|-------------|
+| `synth_skills.yaml` | Freeform skills QA generation (eg: "Create a new github issue to add type hints") |
+| `synth_grounded_skills.yaml` | Domain-specific skill generation (eg: "From the given conversation create a table for feature requests") |
+| `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
+
+All these can be found here: [flows](src/sdg_hub/flows)
+
+## 📺 Video Tutorial
+
+For a comprehensive walkthrough of sdg_hub:
+
+[](https://www.youtube.com/watch?v=aGKCViWjAmA)
+
+## 🤝 Contributing
+
+We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
+
+## 📄 License
+
+This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
+
+---
+
+Built with ❤️ by the Red Hat AI Innovation Team
sdg_hub-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,86 @@
+sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
+sdg_hub/_version.py,sha256=Mmxse1R0ki5tjz9qzU8AQyqUsLt8nTyCAbYQp8R87PU,511
+sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
+sdg_hub/flow.py,sha256=psCRKovzIKrsxmPYh6WN6mOUHrNUbi5HDkUlU6xD7x0,18163
+sdg_hub/flow_runner.py,sha256=kwlYOhIldRC2RCdNV84NFMaWBzAv2plYPGz7drZ7fOA,5648
+sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
+sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
+sdg_hub/prompts.py,sha256=rtiUS2IuaMAQVAy8aAwGxmk23sKC2Qqro7edymbENrk,8165
+sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
+sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
+sdg_hub/blocks/__init__.py,sha256=pmxlv29ohPRdIVE9ojnBs3I58UwNMU0uTtGozOZuZzc,807
+sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
+sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
+sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
+sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
+sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
+sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
+sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
+sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
+sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=bIfQr0q0FyReO94v_lpLO56FikARCvFmZza-ISZTOnA,2453
+sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
+sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=_Mc_i9vaLp1OPKexSOURV5gbXEG41p1eELUukOhz8oM,388
+sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
+sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
+sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
+sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=TYgJ7WQc7NFkf3GeRsbx6lwfA_xFnEOYGELewSqorp0,399
+sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
+sdg_hub/configs/knowledge/generate_questions.yaml,sha256=iJtttZrVvlXFraUSrMowqTCLoJOLDbBndcTNMPTO8A4,2788
+sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
+sdg_hub/configs/knowledge/generate_responses.yaml,sha256=wwiB7lSB9yEB1XG2SIEIRtHkSlKh3NGJAmDaq2J6-ZY,2483
+sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
+sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
+sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
+sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
+sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
+sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
+sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
+sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
+sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
+sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
+sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
+sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
+sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
+sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
+sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
+sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
+sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
+sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
+sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
+sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
+sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
+sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
+sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
+sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
+sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
+sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
+sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
+sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
+sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
+sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
+sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
+sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
+sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
+sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
+sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=5KAyOfhjqpFoDBtG-juEZES4gkskzB2VgSvAAlUbSak,3709
+sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
+sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
+sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
+sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
+sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
+sdg_hub/utils/__init__.py,sha256=UEo-9qPt5iVKBIRvgZhOI0SoIBO6zeBxOuLvUQXaM3g,185
+sdg_hub/utils/config_validation.py,sha256=g92GxN73Mjr0cXvc5amB_Fn4iV9-iKeWmPz9HwLPmNY,3426
+sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
+sdg_hub/utils/validation_result.py,sha256=O3zF6r49LQ9StAf_oWmK2bg-JfTQw6rpbHtHr9lI4ks,264
+sdg_hub-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sdg_hub-0.1.1.dist-info/METADATA,sha256=s4pRrDO0pKXc3g_mGTyqXiNND1Rbd6YJOeHoLbHhSDw,7247
+sdg_hub-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sdg_hub-0.1.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+sdg_hub-0.1.1.dist-info/RECORD,,
sdg_hub/blocks/filterblock.py DELETED
@@ -1,76 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Standard
-import operator
-
-# Third Party
-from datasets import Dataset
-
-# Local
-from .block import Block
-from ..registry import BlockRegistry
-from ..logger_config import setup_logger
-
-logger = setup_logger(__name__)
-
-
-@BlockRegistry.register("FilterByValueBlock")
-class FilterByValueBlock(Block):
-    def __init__(
-        self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
-    ) -> None:
-        """
-        Initializes a new instance of the FilterByValueBlock class.
-
-        Parameters:
-        - filter_column (str): The name of the column in the dataset to apply the filter on.
-        - filter_value (any or list of any): The value(s) to filter by.
-        - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
-        - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
-        - **batch_kwargs: Additional kwargs for batch processing.
-
-        Returns:
-        None
-        """
-        super().__init__(block_name=self.__class__.__name__)
-        self.value = filter_value if isinstance(filter_value, list) else [filter_value]
-        self.column_name = filter_column
-        self.operation = operation
-        self.convert_dtype = convert_dtype
-        self.num_procs = batch_kwargs.get("num_procs", 1)
-
-    def _convert_dtype(self, sample):
-        try:
-            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-        except ValueError as e:
-            logger.error(
-                "Error converting dtype: %s, filling with None to be filtered later", e
-            )
-            sample[self.column_name] = None
-        return sample
-
-    def generate(self, samples) -> Dataset:
-        if self.convert_dtype:
-            samples = samples.map(
-                self._convert_dtype,
-                num_proc=self.num_procs,
-            )
-
-        if self.operation == operator.contains:
-            samples = samples.filter(
-                lambda x: self.operation(self.value, x[self.column_name]),
-                num_proc=self.num_procs,
-            )
-
-        samples = samples.filter(
-            lambda x: x[self.column_name] is not None,
-            num_proc=self.num_procs,
-        )
-
-        samples = samples.filter(
-            lambda x: any(
-                self.operation(x[self.column_name], value) for value in self.value
-            ),
-            num_proc=self.num_procs,
-        )
-
-        return samples
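For reference, a minimal usage sketch of the removed `FilterByValueBlock`, based on the constructor and `generate()` shown above. The dataset, column name, and values are illustrative, and the `sdg_hub.blocks.filterblock` import path only exists in 0.1.0a4:

```python
# Hypothetical sketch against the 0.1.0a4 API (removed in 0.1.1); the toy
# dataset and the "score" column are made up for illustration.
import operator

from datasets import Dataset

from sdg_hub.blocks.filterblock import FilterByValueBlock  # removed in 0.1.1

# Toy dataset whose score column arrives as strings.
ds = Dataset.from_list(
    [
        {"question": "q1", "score": "2.0"},
        {"question": "q2", "score": "1.0"},
    ]
)

# Keep only rows whose score equals 2.0, converting the column to float first.
block = FilterByValueBlock(
    filter_column="score",
    filter_value=2.0,
    operation=operator.eq,
    convert_dtype=float,
    num_procs=1,
)
filtered = block.generate(ds)
print(filtered["question"])  # expected: ["q1"]
```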
sdg_hub/blocks/iterblock.py DELETED
@@ -1,31 +0,0 @@
-# Third Party
-from datasets import Dataset
-
-# Local
-from .block import Block
-from ..registry import BlockRegistry
-from ..logger_config import setup_logger
-
-logger = setup_logger(__name__)
-
-
-@BlockRegistry.register("IterBlock")
-class IterBlock(Block):
-    def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
-        super().__init__(block_name)
-        self.num_iters = num_iters
-        self.block = block_type(**block_kwargs)
-        self.gen_kwargs = kwargs.get("gen_kwargs", {})
-        self.gen_kwargs = kwargs.get("gen_kwargs", {})
-
-    def generate(self, samples, **gen_kwargs) -> Dataset:
-        generated_samples = []
-        num_iters = self.num_iters
-
-        for _ in range(num_iters):
-            batch_generated = self.block.generate(
-                samples, **{**self.gen_kwargs, **gen_kwargs}
-            )
-            generated_samples.extend(batch_generated)
-
-        return Dataset.from_list(generated_samples)
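A hypothetical sketch of how the removed `IterBlock` wrapped another block and repeated its generation. `EchoBlock` below is an illustrative stand-in, not part of sdg_hub, and the import paths refer to the 0.1.0a4 layout:

```python
# Hypothetical sketch against the 0.1.0a4 API (removed in 0.1.1).
from datasets import Dataset

from sdg_hub.blocks.block import Block
from sdg_hub.blocks.iterblock import IterBlock  # removed in 0.1.1


class EchoBlock(Block):
    """Toy stand-in block that simply passes its samples through."""

    def __init__(self, block_name="echo"):
        super().__init__(block_name)

    def generate(self, samples, **gen_kwargs) -> Dataset:
        return samples


ds = Dataset.from_list([{"question": "q1"}, {"question": "q2"}])

# Run the wrapped block three times and concatenate the results.
iter_block = IterBlock(
    block_name="repeat_echo",
    num_iters=3,
    block_type=EchoBlock,
    block_kwargs={"block_name": "echo"},
)
out = iter_block.generate(ds)
print(len(out))  # expected: 6 rows (3 passes over 2 samples)
```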
sdg_hub/blocks/rmblocks.py DELETED
@@ -1,194 +0,0 @@
-"""Module containing blocks for scoring responses using Reward Models."""
-
-# Standard
-from typing import Dict, List
-import json
-from urllib.parse import urljoin
-
-# Third Party
-from datasets import Dataset
-import requests
-
-# Local
-from .block import Block
-from ..logger_config import setup_logger
-from ..registry import BlockRegistry
-
-logger = setup_logger(__name__)
-
-
-@BlockRegistry.register("PRMBlock")
-class PRMBlock(Block):
-    """A block for scoring responses using a ProcessReward Model (PRM) via HTTP API.
-
-    This block sends prompts and responses to a PRM endpoint and returns reward scores
-    for each step in the response.
-    """
-
-    def __init__(
-        self,
-        block_name: str,
-        host: str,
-        port: int,
-        model_name: str,
-        prompt_col: str,
-        response_col: str,
-        output_col: str = "step_rewards",
-        system_prompt: str = None,
-        endpoint: str = "pooling",
-        step_separator: str = "\n\n",
-        step_fill_token: str = "<extra_0>",
-    ) -> None:
-        r"""Initialize the PRM (Process Reward Model) Block.
-
-        Parameters
-        ----------
-        block_name : str
-            Name of the block
-        host : str
-            Hostname of the PRM service (e.g., "0.0.0.0" or "localhost")
-        port : int
-            Port number the service is running on
-        model_name : str
-            Name of the PRM model to use
-        prompt_col : str
-            Column name containing the prompt
-        response_col : str
-            Column name containing the response
-        output_col : str, optional
-            Column name to store the reward scores, by default "step_rewards"
-        system_prompt : str, optional
-            Optional system prompt to use for scoring, by default None
-        endpoint : str, optional
-            API endpoint name, by default "pooling"
-        step_separator : str, optional
-            Separator between steps in the response, by default "\n\n"
-        step_fill_token : str, optional
-            Model specific fill token for steps in the response, by default "<extra_0>" used by Qwen2.5-Math-PRM
-        """
-        super().__init__(block_name)
-        # Construct base URL from host and port
-        self.base_url = f"http://{host.strip('/')}:{port}/"
-        self.endpoint = endpoint.strip("/")
-
-        # Construct the full API URL using urljoin
-        self.api_url = urljoin(self.base_url, self.endpoint)
-        logger.info(f"Initialized PRMBlock with API URL: {self.api_url}")
-
-        self.model_name = model_name
-        self.prompt_col = prompt_col
-        self.response_col = response_col
-        self.output_col = output_col
-        self.system_prompt = system_prompt
-        self.step_separator = step_separator
-        self.step_fill_token = step_fill_token
-
-    def _post_request(self, messages: List[Dict]) -> requests.Response:
-        """Make POST request to PRM API endpoint.
-
-        Parameters
-        ----------
-        messages : List[Dict]
-            List of message dictionaries to send to the API
-
-        Returns
-        -------
-        requests.Response
-            Response from the API
-        """
-        headers = {"User-Agent": "PRMBlock Client"}
-        prompt = {"model": self.model_name, "messages": messages}
-        response = requests.post(self.api_url, headers=headers, json=prompt)
-        return response
-
-    def _format_messages(self, sample: Dict) -> List[Dict]:
-        """Format input sample into messages for the PRM API.
-
-        Parameters
-        ----------
-        sample : Dict
-            Input sample containing prompt and response
-
-        Returns
-        -------
-        List[Dict]
-            Formatted messages for the API
-        """
-        messages = []
-        if self.system_prompt:
-            messages.append({"role": "system", "content": self.system_prompt})
-
-        messages.append({"role": "user", "content": sample[self.prompt_col]})
-        messages.append(
-            {
-                "role": "assistant",
-                "content": self.step_fill_token.join(sample[self.response_col].split(self.step_separator))
-                + self.step_fill_token,
-            }
-        )
-        return messages
-
-    def _extract_rewards(self, response: requests.Response) -> List[float]:
-        """Extract reward scores from API response.
-
-        Parameters
-        ----------
-        response : requests.Response
-            Response from the API
-
-        Returns
-        -------
-        List[float]
-            List of reward scores
-        """
-        try:
-            response_data = response.json()
-            rewards = [x[1] for x in response_data["data"][0]["data"]]
-            return rewards
-        except (KeyError, IndexError, json.JSONDecodeError) as e:
-            logger.error(f"Error extracting rewards from response: {e}")
-            return []
-
-    def _generate(self, sample: dict) -> dict:
-        """Generate reward scores for the input samples.
-
-        Parameters
-        ----------
-        sample : dict
-            Input sample to score
-
-        Returns
-        -------
-        dict
-            Dictionary with added reward scores column
-        """
-        messages = self._format_messages(sample)
-        rm_response = self._post_request(messages)
-
-        if rm_response.status_code != 200:
-            logger.error(f"API request failed with status {rm_response.status_code}")
-            rewards = [0.0] * len(
-                sample[self.response_col].split(self.step_separator)
-            )  # Default to 0 scores on failure
-        else:
-            rewards = self._extract_rewards(rm_response)
-
-        sample[self.output_col] = rewards
-        return sample
-
-    def generate(self, samples: Dataset, batch_size: int = 4) -> Dataset:
-        """Generate reward scores for the input samples.
-
-        Parameters
-        ----------
-        samples : Dataset
-            Input dataset containing samples to score
-        batch_size : int, optional
-            Number of processes to use for parallel processing, by default 4
-
-        Returns
-        -------
-        Dataset
-            Dataset with added reward scores
-        """
-        return samples.map(self._generate, num_proc=batch_size)
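For illustration, a hypothetical sketch of scoring a step-by-step response with the removed `PRMBlock`. The host, port, and model name are placeholders for whatever process-reward-model service you have running (the docstring above assumes a pooling-style endpoint and the Qwen2.5-Math-PRM fill token), and the `sdg_hub.blocks.rmblocks` import exists only in 0.1.0a4:

```python
# Hypothetical sketch against the 0.1.0a4 API (removed in 0.1.1); it requires
# a live PRM endpoint at the placeholder host/port below to actually return scores.
from datasets import Dataset

from sdg_hub.blocks.rmblocks import PRMBlock  # removed in 0.1.1

# One sample with a response split into steps by the default "\n\n" separator.
ds = Dataset.from_list(
    [
        {
            "prompt": "What is 2 + 2 * 3?",
            "response": "First compute 2 * 3 = 6.\n\nThen 2 + 6 = 8.",
        }
    ]
)

# Placeholder endpoint and model name; adjust to your deployment.
prm = PRMBlock(
    block_name="prm_scorer",
    host="localhost",
    port=8000,
    model_name="Qwen/Qwen2.5-Math-PRM-7B",
    prompt_col="prompt",
    response_col="response",
)
scored = prm.generate(ds, batch_size=1)
print(scored[0]["step_rewards"])  # one reward per step, or 0.0s if the request fails
```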
sdg_hub/configs/skills/data_recipe/default_recipe.yaml DELETED
@@ -1,6 +0,0 @@
-datasets:
-  - path: instructlab/InstructLabCommunity
-    sampling_size: 1.0
-
-sys_prompt: |
-  I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
sdg_hub/flows/annotation/emotion/detailed_description.yaml DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/detailed_description.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt
sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/detailed_description_icl.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt
sdg_hub/flows/annotation/emotion/simple.yaml DELETED
@@ -1,19 +0,0 @@
-- block_type: LLMBlock
-  block_config:
-    block_name: gen_responses
-    config_path: configs/annotations/simple.yaml
-    model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
-    output_cols:
-      - output
-  gen_kwargs:
-    max_tokens: 5
-    temperature: 0
-    extra_body:
-      guided_choice:
-        - "joy"
-        - "sadness"
-        - "anger"
-        - "fear"
-        - "love"
-  drop_duplicates:
-    - prompt