sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +35 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +149 -204
  5. sdg_hub/blocks/utilblocks.py +500 -43
  6. sdg_hub/checkpointer.py +139 -0
  7. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  8. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  9. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  10. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  11. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  13. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  14. sdg_hub/configs/skills/contexts.yaml +18 -11
  15. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  16. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  17. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  18. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  19. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  20. sdg_hub/configs/skills/router.yaml +53 -6
  21. sdg_hub/flow.py +351 -21
  22. sdg_hub/flow_runner.py +216 -0
  23. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
  24. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  25. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  26. sdg_hub/pipeline.py +67 -12
  27. sdg_hub/prompts.py +26 -0
  28. sdg_hub/sdg.py +128 -86
  29. sdg_hub/utils/config_validation.py +91 -0
  30. sdg_hub/utils/validation_result.py +10 -0
  31. sdg_hub-0.1.1.dist-info/METADATA +190 -0
  32. sdg_hub-0.1.1.dist-info/RECORD +86 -0
  33. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
  34. sdg_hub/blocks/filterblock.py +0 -76
  35. sdg_hub/blocks/iterblock.py +0 -31
  36. sdg_hub/blocks/rmblocks.py +0 -194
  37. sdg_hub/configs/annotations/simple.yaml +0 -10
  38. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  39. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  40. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  41. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  42. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  43. sdg_hub/utils/chunking.py +0 -73
  44. sdg_hub/utils/docprocessor.py +0 -357
  45. sdg_hub/utils/parse_and_convert.py +0 -392
  46. sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
  47. sdg_hub-0.1.0a3.dist-info/RECORD +0 -90
  48. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  49. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  50. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  51. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  52. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  53. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  54. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  55. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  58. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
  59. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,10 @@
1
+ from typing import List
2
+
3
+
4
class ValidationResult:
    """Outcome of a validation pass.

    Attributes are plain and public:

    - ``valid``: the boolean verdict handed to the constructor.
    - ``errors``: the list of error messages handed to the constructor
      (stored by reference, not copied).
    """

    def __init__(self, valid: bool, errors: List[str]):
        # Store both constructor arguments unchanged.
        self.valid, self.errors = valid, errors

    def __repr__(self):
        # Debug-friendly rendering that shows both stored fields.
        return f"ValidationResult(valid={self.valid}, errors={self.errors})"
@@ -0,0 +1,190 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdg_hub
3
+ Version: 0.1.1
4
+ Summary: Synthetic Data Generation
5
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
+ License: Apache-2.0
7
+ Project-URL: homepage, https://ai-innovation.team/
8
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
9
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: Implementation :: CPython
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: click<9.0.0,>=8.1.7
27
+ Requires-Dist: datasets<4.0.0,>=2.18.0
28
+ Requires-Dist: httpx<1.0.0,>=0.25.0
29
+ Requires-Dist: jinja2
30
+ Requires-Dist: openai<2.0.0,>=1.13.3
31
+ Requires-Dist: rich
32
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
33
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
34
+ Provides-Extra: web-interface
35
+ Requires-Dist: flask>=3.0.2; extra == "web-interface"
36
+ Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
37
+ Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
38
+ Provides-Extra: vllm
39
+ Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
40
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
41
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
42
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
43
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
44
+ Provides-Extra: examples
45
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
46
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
47
+ Requires-Dist: langchain-text-splitters; extra == "examples"
48
+ Requires-Dist: docling>=2.3.0; extra == "examples"
49
+ Provides-Extra: dev
50
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
51
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
52
+ Requires-Dist: pylint-pydantic; extra == "dev"
53
+ Requires-Dist: pytest; extra == "dev"
54
+ Requires-Dist: pytest-asyncio; extra == "dev"
55
+ Requires-Dist: pytest-cov; extra == "dev"
56
+ Requires-Dist: pytest-html; extra == "dev"
57
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
58
+ Dynamic: license-file
59
+
60
+ # SDG Hub: Synthetic Data Generation Toolkit
61
+
62
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
63
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
64
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
65
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
66
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
67
+
68
+ <html>
69
+ <h3 align="center">
70
+ A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
71
+ </h3>
72
+ <h3 align="center">
73
+ <a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
74
+ <a href="examples/">Examples</a> |
75
+ <a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
76
+ </h3>
77
+ </html>
78
+
79
+ SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
80
+
81
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
82
+
83
+ ---
84
+
85
+ ## ✨ Key Features
86
+
87
+ - **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
88
+ simple YAML configuration files without writing any code.
89
+
90
+ - **Modular Block System**: Compose workflows from reusable, self-contained
91
+ blocks that handle LLM calls, data transformations, and filtering.
92
+
93
+ - **LLM-Agnostic**: Works with any language model through configurable
94
+ prompt templates and generation parameters.
95
+
96
+ - **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
97
+
98
+ ## 🚀 Installation
99
+
100
+ ### Stable Release (Recommended)
101
+
102
+ ```bash
103
+ pip install sdg-hub
104
+ ```
105
+
106
+ ### Development Version
107
+
108
+ ```bash
109
+ pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
110
+ ```
111
+
112
+ ## 🏁 Quick Start
113
+
114
+ ### Prerequisites
115
+
116
+ Before getting started, make sure you have:
117
+ - Python 3.9 or higher
118
+ - LLM Inference Endpoint exposed through OpenAI API
119
+
120
+ ### Simple Example
121
+
122
+ Here's the simplest way to get started:
123
+
124
+ ```python
125
+ from sdg_hub.flow_runner import run_flow
126
+
127
+ # Run a basic knowledge generation flow
128
+ run_flow(
129
+ ds_path="my_data.jsonl",
130
+ save_path="output.jsonl",
131
+ endpoint="http://0.0.0.0:8000/v1",
132
+ flow_path="flows/generation/knowledge/synth_knowledge.yaml"
133
+ )
134
+ ```
135
+
136
+ ### Advanced Configuration
137
+ You can invoke any built-in flow using run_flow:
138
+ ```python
139
+ from sdg_hub.flow_runner import run_flow
140
+
141
+ run_flow(
142
+ ds_path="path/to/dataset.jsonl",
143
+ save_path="path/to/output.jsonl",
144
+ endpoint="http://0.0.0.0:8000/v1",
145
+ flow_path="path/to/flow.yaml",
146
+ checkpoint_dir="path/to/checkpoints",
147
+ batch_size=8,
148
+ num_workers=32,
149
+ save_freq=2,
150
+ )
151
+ ```
152
+
153
+ ### 📂 Available Built-in Flows
154
+
155
+ You can start with any of these YAML flows out of the box:
156
+
157
+ #### 🔎 **Knowledge Flows**
158
+
159
+ | Flow | Description |
160
+ |------|-------------|
161
+ | `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
162
+ | `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
163
+
164
+ #### 🧠 **Skills Flows**
165
+
166
+ | Flow | Description |
167
+ |------|-------------|
168
+ | `synth_skills.yaml` | Freeform skills QA generation (e.g., "Create a new GitHub issue to add type hints") |
169
+ | `synth_grounded_skills.yaml` | Domain-specific skill generation (e.g., "From the given conversation create a table for feature requests") |
170
+ | `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
171
+
172
+ All these can be found here: [flows](src/sdg_hub/flows)
173
+
174
+ ## 📺 Video Tutorial
175
+
176
+ For a comprehensive walkthrough of sdg_hub:
177
+
178
+ [![SDG Hub Tutorial](https://img.youtube.com/vi/aGKCViWjAmA/0.jpg)](https://www.youtube.com/watch?v=aGKCViWjAmA)
179
+
180
+ ## 🤝 Contributing
181
+
182
+ We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
183
+
184
+ ## 📄 License
185
+
186
+ This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
187
+
188
+ ---
189
+
190
+ Built with ❤️ by the Red Hat AI Innovation Team
@@ -0,0 +1,86 @@
1
+ sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
2
+ sdg_hub/_version.py,sha256=Mmxse1R0ki5tjz9qzU8AQyqUsLt8nTyCAbYQp8R87PU,511
3
+ sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
4
+ sdg_hub/flow.py,sha256=psCRKovzIKrsxmPYh6WN6mOUHrNUbi5HDkUlU6xD7x0,18163
5
+ sdg_hub/flow_runner.py,sha256=kwlYOhIldRC2RCdNV84NFMaWBzAv2plYPGz7drZ7fOA,5648
6
+ sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
7
+ sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
8
+ sdg_hub/prompts.py,sha256=rtiUS2IuaMAQVAy8aAwGxmk23sKC2Qqro7edymbENrk,8165
9
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
11
+ sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
12
+ sdg_hub/blocks/__init__.py,sha256=pmxlv29ohPRdIVE9ojnBs3I58UwNMU0uTtGozOZuZzc,807
13
+ sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
14
+ sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
15
+ sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
16
+ sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
19
+ sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
20
+ sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
21
+ sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
22
+ sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
23
+ sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=bIfQr0q0FyReO94v_lpLO56FikARCvFmZza-ISZTOnA,2453
25
+ sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
26
+ sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=_Mc_i9vaLp1OPKexSOURV5gbXEG41p1eELUukOhz8oM,388
27
+ sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
28
+ sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
29
+ sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
30
+ sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=TYgJ7WQc7NFkf3GeRsbx6lwfA_xFnEOYGELewSqorp0,399
31
+ sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
32
+ sdg_hub/configs/knowledge/generate_questions.yaml,sha256=iJtttZrVvlXFraUSrMowqTCLoJOLDbBndcTNMPTO8A4,2788
33
+ sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
34
+ sdg_hub/configs/knowledge/generate_responses.yaml,sha256=wwiB7lSB9yEB1XG2SIEIRtHkSlKh3NGJAmDaq2J6-ZY,2483
35
+ sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
36
+ sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
37
+ sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
38
+ sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
40
+ sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
42
+ sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
43
+ sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
44
+ sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
45
+ sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
46
+ sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
47
+ sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
48
+ sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
49
+ sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
50
+ sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
51
+ sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
52
+ sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
53
+ sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
54
+ sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
55
+ sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
56
+ sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
57
+ sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
58
+ sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
59
+ sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
60
+ sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
61
+ sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
+ sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
63
+ sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
64
+ sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
65
+ sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
66
+ sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
67
+ sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
68
+ sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
69
+ sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
70
+ sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
71
+ sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
72
+ sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=5KAyOfhjqpFoDBtG-juEZES4gkskzB2VgSvAAlUbSak,3709
73
+ sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
74
+ sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
75
+ sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
76
+ sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
77
+ sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
78
+ sdg_hub/utils/__init__.py,sha256=UEo-9qPt5iVKBIRvgZhOI0SoIBO6zeBxOuLvUQXaM3g,185
79
+ sdg_hub/utils/config_validation.py,sha256=g92GxN73Mjr0cXvc5amB_Fn4iV9-iKeWmPz9HwLPmNY,3426
80
+ sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
81
+ sdg_hub/utils/validation_result.py,sha256=O3zF6r49LQ9StAf_oWmK2bg-JfTQw6rpbHtHr9lI4ks,264
82
+ sdg_hub-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
83
+ sdg_hub-0.1.1.dist-info/METADATA,sha256=s4pRrDO0pKXc3g_mGTyqXiNND1Rbd6YJOeHoLbHhSDw,7247
84
+ sdg_hub-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
85
+ sdg_hub-0.1.1.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
86
+ sdg_hub-0.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,76 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- # Standard
3
- import operator
4
-
5
- # Third Party
6
- from datasets import Dataset
7
-
8
- # Local
9
- from .block import Block
10
- from ..registry import BlockRegistry
11
- from ..logger_config import setup_logger
12
-
13
- logger = setup_logger(__name__)
14
-
15
-
16
@BlockRegistry.register("FilterByValueBlock")
class FilterByValueBlock(Block):
    """Block that drops dataset rows whose column value fails a comparison.

    The comparison is a two-argument callable applied to the row's column
    value and each configured filter value; a row is kept when the callable
    is truthy for at least one filter value.
    """

    def __init__(
        self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
    ) -> None:
        """
        Initializes a new instance of the FilterByValueBlock class.

        Parameters:
        - filter_column (str): The name of the column in the dataset to apply the filter on.
        - filter_value (any or list of any): The value(s) to filter by. A single
          value is wrapped in a list.
        - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
        - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
        - **batch_kwargs: Additional kwargs for batch processing. Only
          ``num_procs`` (default 1) is read.

        Returns:
        None
        """
        super().__init__(block_name=self.__class__.__name__)
        # Normalise to a list so `generate` can always iterate over candidates.
        self.value = filter_value if isinstance(filter_value, list) else [filter_value]
        self.column_name = filter_column
        self.operation = operation
        self.convert_dtype = convert_dtype
        self.num_procs = batch_kwargs.get("num_procs", 1)

    def _convert_dtype(self, sample):
        """Convert the filter column of one row in place and return the row.

        A value the converter rejects is replaced with ``None`` so that
        ``generate`` filters the row out instead of failing the whole run.
        """
        try:
            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
        except (ValueError, TypeError) as e:
            # TypeError covers inputs such as None or a list passed to
            # int()/float(); they are unconvertible in the same sense as a
            # ValueError and get the same "fill with None" treatment.
            logger.error(
                "Error converting dtype: %s, filling with None to be filtered later", e
            )
            sample[self.column_name] = None
        return sample

    def generate(self, samples) -> Dataset:
        """Return ``samples`` restricted to the rows that pass the filter.

        Steps, in order:
        1. If ``convert_dtype`` is set, convert the filter column of every row.
        2. If ``operation`` is ``operator.contains``, keep rows whose column
           value is a member of the filter-value list.
        3. Drop rows whose column value is ``None``.
        4. Keep rows for which ``operation(column_value, filter_value)`` is
           truthy for at least one filter value.
        """
        if self.convert_dtype:
            samples = samples.map(
                self._convert_dtype,
                num_proc=self.num_procs,
            )

        if self.operation == operator.contains:
            # Arguments are (filter values, column value) here, i.e. a
            # membership test of the column value in the configured list.
            # NOTE(review): the generic filter below still runs afterwards with
            # the arguments in (column value, filter value) order - confirm
            # that applying both is intended.
            samples = samples.filter(
                lambda x: self.operation(self.value, x[self.column_name]),
                num_proc=self.num_procs,
            )

        samples = samples.filter(
            lambda x: x[self.column_name] is not None,
            num_proc=self.num_procs,
        )

        samples = samples.filter(
            lambda x: any(
                self.operation(x[self.column_name], value) for value in self.value
            ),
            num_proc=self.num_procs,
        )

        return samples
@@ -1,31 +0,0 @@
1
- # Third Party
2
- from datasets import Dataset
3
-
4
- # Local
5
- from .block import Block
6
- from ..registry import BlockRegistry
7
- from ..logger_config import setup_logger
8
-
9
- logger = setup_logger(__name__)
10
-
11
-
12
@BlockRegistry.register("IterBlock")
class IterBlock(Block):
    """Block that runs a wrapped block several times and concatenates the results."""

    def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
        """Build the wrapped block and remember how often to run it.

        Parameters:
        - block_name: Name passed to the ``Block`` base class.
        - num_iters: Number of times ``generate`` invokes the wrapped block.
        - block_type: Callable (typically a block class) used to construct the
          wrapped block.
        - block_kwargs: Keyword arguments forwarded to ``block_type``.
        - **kwargs: Only ``gen_kwargs`` is read; it supplies default keyword
          arguments for every wrapped ``generate`` call (default: empty dict).
        """
        super().__init__(block_name)
        self.num_iters = num_iters
        self.block = block_type(**block_kwargs)
        self.gen_kwargs = kwargs.get("gen_kwargs", {})

    def generate(self, samples, **gen_kwargs) -> Dataset:
        """Run the wrapped block ``num_iters`` times on the same ``samples``.

        Call-time ``gen_kwargs`` override the defaults stored at construction.
        The rows produced by every iteration are collected into one list and
        returned as a new ``Dataset``.
        """
        generated_samples = []

        for _ in range(self.num_iters):
            # Later mapping wins, so per-call kwargs take precedence.
            batch_generated = self.block.generate(
                samples, **{**self.gen_kwargs, **gen_kwargs}
            )
            generated_samples.extend(batch_generated)

        return Dataset.from_list(generated_samples)
@@ -1,194 +0,0 @@
1
- """Module containing blocks for scoring responses using Reward Models."""
2
-
3
- # Standard
4
- from typing import Dict, List
5
- import json
6
- from urllib.parse import urljoin
7
-
8
- # Third Party
9
- from datasets import Dataset
10
- import requests
11
-
12
- # Local
13
- from .block import Block
14
- from ..logger_config import setup_logger
15
- from ..registry import BlockRegistry
16
-
17
- logger = setup_logger(__name__)
18
-
19
-
20
@BlockRegistry.register("PRMBlock")
class PRMBlock(Block):
    """A block for scoring responses using a ProcessReward Model (PRM) via HTTP API.

    This block sends prompts and responses to a PRM endpoint and returns reward scores
    for each step in the response.
    """

    def __init__(
        self,
        block_name: str,
        host: str,
        port: int,
        model_name: str,
        prompt_col: str,
        response_col: str,
        output_col: str = "step_rewards",
        system_prompt: str = None,
        endpoint: str = "pooling",
        step_separator: str = "\n\n",
        step_fill_token: str = "<extra_0>",
    ) -> None:
        r"""Initialize the PRM (Process Reward Model) Block.

        Parameters
        ----------
        block_name : str
            Name of the block
        host : str
            Hostname of the PRM service (e.g., "0.0.0.0" or "localhost")
        port : int
            Port number the service is running on
        model_name : str
            Name of the PRM model to use
        prompt_col : str
            Column name containing the prompt
        response_col : str
            Column name containing the response
        output_col : str, optional
            Column name to store the reward scores, by default "step_rewards"
        system_prompt : str, optional
            Optional system prompt to use for scoring, by default None
        endpoint : str, optional
            API endpoint name, by default "pooling"
        step_separator : str, optional
            Separator between steps in the response, by default "\n\n"
        step_fill_token : str, optional
            Model specific fill token for steps in the response, by default "<extra_0>" used by Qwen2.5-Math-PRM
        """
        super().__init__(block_name)
        # Construct base URL from host and port
        self.base_url = f"http://{host.strip('/')}:{port}/"
        self.endpoint = endpoint.strip("/")

        # Construct the full API URL using urljoin
        self.api_url = urljoin(self.base_url, self.endpoint)
        logger.info("Initialized PRMBlock with API URL: %s", self.api_url)

        self.model_name = model_name
        self.prompt_col = prompt_col
        self.response_col = response_col
        self.output_col = output_col
        self.system_prompt = system_prompt
        self.step_separator = step_separator
        self.step_fill_token = step_fill_token

    def _post_request(self, messages: List[Dict]) -> requests.Response:
        """Make POST request to PRM API endpoint.

        Parameters
        ----------
        messages : List[Dict]
            List of message dictionaries to send to the API

        Returns
        -------
        requests.Response
            Response from the API
        """
        headers = {"User-Agent": "PRMBlock Client"}
        prompt = {"model": self.model_name, "messages": messages}
        # NOTE(review): no timeout is passed, so a stalled server would block
        # this call indefinitely - consider whether a timeout should be set.
        return requests.post(self.api_url, headers=headers, json=prompt)

    def _format_messages(self, sample: Dict) -> List[Dict]:
        """Format input sample into messages for the PRM API.

        Parameters
        ----------
        sample : Dict
            Input sample containing prompt and response

        Returns
        -------
        List[Dict]
            Formatted messages for the API
        """
        messages = []
        if self.system_prompt:
            messages.append({"role": "system", "content": self.system_prompt})

        messages.append({"role": "user", "content": sample[self.prompt_col]})

        # Replace every step separator with the fill token and terminate the
        # final step with one more fill token.
        steps = sample[self.response_col].split(self.step_separator)
        messages.append(
            {
                "role": "assistant",
                "content": self.step_fill_token.join(steps) + self.step_fill_token,
            }
        )
        return messages

    def _extract_rewards(self, response: requests.Response) -> List[float]:
        """Extract reward scores from API response.

        Parameters
        ----------
        response : requests.Response
            Response from the API

        Returns
        -------
        List[float]
            List of reward scores; empty when the payload cannot be parsed or
            does not have the expected structure
        """
        try:
            response_data = response.json()
            # The second element of every entry under data[0]["data"] is used
            # as the score.
            return [x[1] for x in response_data["data"][0]["data"]]
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError is the base class of json.JSONDecodeError; TypeError
            # covers payloads where an expected container is null or otherwise
            # not subscriptable.
            logger.error("Error extracting rewards from response: %s", e)
            return []

    def _generate(self, sample: dict) -> dict:
        """Generate reward scores for the input samples.

        Parameters
        ----------
        sample : dict
            Input sample to score

        Returns
        -------
        dict
            Dictionary with added reward scores column
        """
        messages = self._format_messages(sample)
        rm_response = self._post_request(messages)

        if rm_response.status_code != 200:
            logger.error("API request failed with status %s", rm_response.status_code)
            # Default to 0 scores on failure, one per step of the response
            num_steps = len(sample[self.response_col].split(self.step_separator))
            rewards = [0.0] * num_steps
        else:
            rewards = self._extract_rewards(rm_response)

        sample[self.output_col] = rewards
        return sample

    def generate(self, samples: Dataset, batch_size: int = 4) -> Dataset:
        """Generate reward scores for the input samples.

        Parameters
        ----------
        samples : Dataset
            Input dataset containing samples to score
        batch_size : int, optional
            Passed to ``Dataset.map`` as ``num_proc``, i.e. the number of
            processes used for parallel processing, by default 4

        Returns
        -------
        Dataset
            Dataset with added reward scores
        """
        return samples.map(self._generate, num_proc=batch_size)
@@ -1,10 +0,0 @@
1
- system: ~
2
- introduction: |
3
- Task Description: {{ simple_task_description }}
4
- principles: ~
5
- examples: ~
6
- generation: |
7
- Here is the query for annotation:
8
- {{ prompt }}
9
- start_tags: [""]
10
- end_tags: [""]
@@ -1,3 +0,0 @@
1
- datasets: []
2
- sys_prompt: |
3
- I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
@@ -1,6 +0,0 @@
1
- datasets:
2
- - path: instructlab/InstructLabCommunity
3
- sampling_size: 1.0
4
-
5
- sys_prompt: |
6
- I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.
@@ -1,19 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_responses
4
- config_path: configs/annotations/detailed_description.yaml
5
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
6
- output_cols:
7
- - output
8
- gen_kwargs:
9
- max_tokens: 5
10
- temperature: 0
11
- extra_body:
12
- guided_choice:
13
- - "joy"
14
- - "sadness"
15
- - "anger"
16
- - "fear"
17
- - "love"
18
- drop_duplicates:
19
- - prompt
@@ -1,19 +0,0 @@
1
- - block_type: LLMBlock
2
- block_config:
3
- block_name: gen_responses
4
- config_path: configs/annotations/detailed_description_icl.yaml
5
- model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
6
- output_cols:
7
- - output
8
- gen_kwargs:
9
- max_tokens: 5
10
- temperature: 0
11
- extra_body:
12
- guided_choice:
13
- - "joy"
14
- - "sadness"
15
- - "anger"
16
- - "fear"
17
- - "love"
18
- drop_duplicates:
19
- - prompt