sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +41 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +121 -193
  5. sdg_hub/blocks/openaichatblock.py +556 -0
  6. sdg_hub/blocks/utilblocks.py +500 -43
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  9. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  10. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  11. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  13. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  14. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  15. sdg_hub/configs/skills/contexts.yaml +18 -11
  16. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  17. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  18. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  19. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  20. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  21. sdg_hub/configs/skills/router.yaml +53 -6
  22. sdg_hub/flow.py +366 -33
  23. sdg_hub/flow_runner.py +437 -0
  24. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +21 -9
  25. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  26. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  27. sdg_hub/pipeline.py +67 -12
  28. sdg_hub/prompts.py +52 -0
  29. sdg_hub/sdg.py +128 -86
  30. sdg_hub/utils/__init__.py +5 -0
  31. sdg_hub/utils/config_validation.py +91 -0
  32. sdg_hub/utils/error_handling.py +94 -0
  33. sdg_hub/utils/path_resolution.py +62 -0
  34. sdg_hub/utils/validation_result.py +10 -0
  35. sdg_hub-0.1.2.dist-info/METADATA +190 -0
  36. sdg_hub-0.1.2.dist-info/RECORD +89 -0
  37. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/WHEEL +1 -1
  38. sdg_hub/blocks/filterblock.py +0 -76
  39. sdg_hub/blocks/iterblock.py +0 -31
  40. sdg_hub/blocks/rmblocks.py +0 -194
  41. sdg_hub/configs/annotations/simple.yaml +0 -10
  42. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  43. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  44. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  45. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  46. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  47. sdg_hub/utils/chunking.py +0 -73
  48. sdg_hub/utils/docprocessor.py +0 -357
  49. sdg_hub/utils/parse_and_convert.py +0 -392
  50. sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
  51. sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
  52. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  53. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  54. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  55. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  58. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  59. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  60. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  61. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  62. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/licenses/LICENSE +0 -0
  63. {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,62 @@ sdg_hub/utils/path_resolution.py (new file)
+ """
+ Path resolution utilities for SDG Hub.
+
+ This module provides utilities for resolving file paths relative to one or more
+ base directories, with support for both single directory and multiple directory
+ search paths.
+ """
+
+ # Standard
+ from typing import List, Union
+ import os
+
+
+ def resolve_path(filename: str, search_dirs: Union[str, List[str]]) -> str:
+     """Resolve a file path relative to one or more search directories.
+
+     Files are checked in the following order:
+     1. Absolute path is always used as-is
+     2. Checked relative to each directory in search_dirs (in order)
+     3. If not found, returns the original filename (assumes relative to current directory)
+
+     Parameters
+     ----------
+     filename : str
+         The path to the file to resolve.
+     search_dirs : Union[str, List[str]]
+         Directory or list of directories in which to search for the file.
+
+     Returns
+     -------
+     str
+         Resolved file path.
+
+     Examples
+     --------
+     >>> resolve_path("config.yaml", "/path/to/base")
+     '/path/to/base/config.yaml' # if file exists
+
+     >>> resolve_path("config.yaml", ["/path1", "/path2"])
+     '/path1/config.yaml' # if file exists in path1
+     '/path2/config.yaml' # if file exists in path2 but not path1
+
+     >>> resolve_path("/absolute/path/file.yaml", ["/path1", "/path2"])
+     '/absolute/path/file.yaml' # absolute path always used as-is
+     """
+     # Handle absolute paths - always use as-is
+     if os.path.isabs(filename):
+         return filename
+
+     # Convert single directory to list for uniform handling
+     if isinstance(search_dirs, str):
+         search_dirs = [search_dirs]
+
+     # Check each directory in order
+     for directory in search_dirs:
+         full_file_path = os.path.join(directory, filename)
+         if os.path.isfile(full_file_path):
+             return full_file_path
+
+     # If not found in any search directory, return the original filename
+     # This assumes the path is relative to the current directory
+     return filename
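A minimal usage sketch of the new helper, assuming the 0.1.2 wheel is installed; the directory names below are illustrative, not taken from the package:

```python
# Usage sketch for the new path resolution helper (sdg_hub 0.1.2).
# The search directories here are illustrative placeholders.
from sdg_hub.utils.path_resolution import resolve_path

# Look for a config first in a flow-local directory, then in a shared fallback.
config_path = resolve_path(
    "atomic_facts.yaml",
    ["/opt/flows/knowledge", "/opt/sdg_hub/configs/knowledge"],
)

# If the file exists in neither directory, the original relative name comes back
# unchanged and resolution is deferred to the current working directory.
print(config_path)
```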
@@ -0,0 +1,10 @@ sdg_hub/utils/validation_result.py (new file)
+ from typing import List
+
+
+ class ValidationResult:
+     def __init__(self, valid: bool, errors: List[str]):
+         self.valid = valid
+         self.errors = errors
+
+     def __repr__(self):
+         return f"ValidationResult(valid={self.valid}, errors={self.errors})"
@@ -0,0 +1,190 @@ sdg_hub-0.1.2.dist-info/METADATA (new file)
+ Metadata-Version: 2.4
+ Name: sdg_hub
+ Version: 0.1.2
+ Summary: Synthetic Data Generation
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
+ License: Apache-2.0
+ Project-URL: homepage, https://ai-innovation.team/
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Environment :: Console
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: click<9.0.0,>=8.1.7
+ Requires-Dist: datasets<4.0.0,>=2.18.0
+ Requires-Dist: httpx<1.0.0,>=0.25.0
+ Requires-Dist: jinja2
+ Requires-Dist: openai<2.0.0,>=1.13.3
+ Requires-Dist: rich
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
+ Provides-Extra: web-interface
+ Requires-Dist: flask>=3.0.2; extra == "web-interface"
+ Requires-Dist: pyyaml>=6.0.1; extra == "web-interface"
+ Requires-Dist: flask-wtf>=1.2.2; extra == "web-interface"
+ Provides-Extra: vllm
+ Requires-Dist: vllm<0.8.4,>=0.8.0; extra == "vllm"
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
+ Provides-Extra: examples
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
+ Requires-Dist: langchain-text-splitters; extra == "examples"
+ Requires-Dist: docling>=2.3.0; extra == "examples"
+ Provides-Extra: dev
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
+ Requires-Dist: pylint-pydantic; extra == "dev"
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: pytest-asyncio; extra == "dev"
+ Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: pytest-html; extra == "dev"
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
+ Dynamic: license-file
+
+ # SDG Hub: Synthetic Data Generation Toolkit
+
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
+
+ <html>
+ <h3 align="center">
+ A modular, scalable, and efficient solution for creating synthetic data generation flows in a "low-code" manner.
+ </h3>
+ <h3 align="center">
+ <a href="http://ai-innovation.team/sdg_hub">Documentation</a> |
+ <a href="examples/">Examples</a> |
+ <a href="https://www.youtube.com/watch?v=aGKCViWjAmA">Video Tutorial</a>
+ </h3>
+ </html>
+
+ SDG Hub is designed to simplify data creation for LLMs, allowing users to chain computational units and build powerful flows for generating data and processing tasks. Define complex workflows using nothing but YAML configuration files.
+
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
+
+ ---
+
+ ## ✨ Key Features
+
+ - **Low-Code Flow Creation**: Build sophisticated data generation pipelines using
+ simple YAML configuration files without writing any code.
+
+ - **Modular Block System**: Compose workflows from reusable, self-contained
+ blocks that handle LLM calls, data transformations, and filtering.
+
+ - **LLM-Agnostic**: Works with any language model through configurable
+ prompt templates and generation parameters.
+
+ - **Prompt Engineering Friendly**: Tune LLM behavior by editing declarative YAML prompts.
+
+ ## 🚀 Installation
+
+ ### Stable Release (Recommended)
+
+ ```bash
+ pip install sdg-hub
+ ```
+
+ ### Development Version
+
+ ```bash
+ pip install git+https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
+ ```
+
+ ## 🏁 Quick Start
+
+ ### Prerequisites
+
+ Before getting started, make sure you have:
+ - Python 3.8 or higher
+ - LLM Inference Endpoint exposed through OpenAI API
+
+ ### Simple Example
+
+ Here's the simplest way to get started:
+
+ ```python
+ from sdg_hub.flow_runner import run_flow
+
+ # Run a basic knowledge generation flow
+ run_flow(
+     ds_path="my_data.jsonl",
+     save_path="output.jsonl",
+     endpoint="http://0.0.0.0:8000/v1",
+     flow_path="flows/generation/knowledge/synth_knowledge.yaml"
+ )
+ ```
+
+ ### Advanced Configuration
+ You can invoke any built-in flow using run_flow:
+ ```python
+ from sdg_hub.flow_runner import run_flow
+
+ run_flow(
+     ds_path="path/to/dataset.jsonl",
+     save_path="path/to/output.jsonl",
+     endpoint="http://0.0.0.0:8000/v1",
+     flow_path="path/to/flow.yaml",
+     checkpoint_dir="path/to/checkpoints",
+     batch_size=8,
+     num_workers=32,
+     save_freq=2,
+ )
+ ```
+
+ ### 📂 Available Built-in Flows
+
+ You can start with any of these YAML flows out of the box:
+
+ #### 🔎 **Knowledge Flows**
+
+ | Flow | Description |
+ |------|-------------|
+ | `synth_knowledge.yaml` | Produces document-grounded questions and answers for factual memorization |
+ | `synth_knowledge1.5.yaml` | Improved version that builds intermediate representations for better recall |
+
+ #### 🧠 **Skills Flows**
+
+ | Flow | Description |
+ |------|-------------|
+ | `synth_skills.yaml` | Freeform skills QA generation (eg: "Create a new github issue to add type hints") |
+ | `synth_grounded_skills.yaml` | Domain-specific skill generation (eg: "From the given conversation create a table for feature requests") |
+ | `improve_responses.yaml` | Uses planning and critique-based refinement to improve generated answers |
+
+ All these can be found here: [flows](src/sdg_hub/flows)
+
+ ## 📺 Video Tutorial
+
+ For a comprehensive walkthrough of sdg_hub:
+
+ [![SDG Hub Tutorial](https://img.youtube.com/vi/aGKCViWjAmA/0.jpg)](https://www.youtube.com/watch?v=aGKCViWjAmA)
+
+ ## 🤝 Contributing
+
+ We welcome contributions from the community! Whether it's bug reports, feature requests, documentation improvements, or code contributions, please check out our [contribution guidelines](CONTRIBUTING.md).
+
+ ## 📄 License
+
+ This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
+
+ ---
+
+ Built with ❤️ by the Red Hat AI Innovation Team
@@ -0,0 +1,89 @@ sdg_hub-0.1.2.dist-info/RECORD (new file)
+ sdg_hub/__init__.py,sha256=5Wa6onDndPvG4iwnjq2jK747t3-7XKdQn2WfHfq1sFc,67
+ sdg_hub/_version.py,sha256=bSmADqydH8nBu-J4lG8UVuR7hnU_zcwhnSav2oQ0W0A,511
+ sdg_hub/checkpointer.py,sha256=R0pNKL_q7-BerxmIarY0w1nFYaq7fGnoRRkCVL6Z-Gw,5053
+ sdg_hub/flow.py,sha256=14WDZfb-VDUBwXsVo9u5oMuWD6aOm-GWtIdT64z4j-0,18050
+ sdg_hub/flow_runner.py,sha256=xeAIdx2r86kwtdrMFysjR1N-j4teonvbSHKg-m1VNSs,14584
+ sdg_hub/logger_config.py,sha256=7uHEJVRfym1c4n95DOKHelLXqAus8uHsZYmzLsEjqpo,422
+ sdg_hub/pipeline.py,sha256=mahktfoCMVnuBnvLNjAVOAoFKNQo-wb0Dz1_xdYhKDM,3852
+ sdg_hub/prompts.py,sha256=Gto1KcIhO-50ERvZx1Qzu-eAhSlIkOjYH9F6j2eIPfY,17482
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/registry.py,sha256=Sc_HNxo4n0pgWMiEDd_sLjxaSXAMZFiHJIhQKqjywwk,3772
+ sdg_hub/sdg.py,sha256=8SKrSnqyvJAwE2Muf9lXw9ONRcDzqmCtaEzFHCYW4CY,6914
+ sdg_hub/blocks/__init__.py,sha256=I-kMjIM7E1NrPLyBuUi0yNoXnuw_kTK3A7ybyt3pOxU,936
+ sdg_hub/blocks/block.py,sha256=zdeyDyYiY0EdD3xS7kZR2hRZCRkbygQ4WONp_zv3X7w,3051
+ sdg_hub/blocks/llmblock.py,sha256=nWslPFZSCiyL7MXQurOk6Jx29UOsgnVDMI3PTwje7kg,13678
+ sdg_hub/blocks/openaichatblock.py,sha256=BWsWFEozWptwe1MMaz-_ZmgQPsNbCRun6ZlaKD3ICxQ,20016
+ sdg_hub/blocks/utilblocks.py,sha256=U2PQk26cwHOgofk5IenHjrao08gbqPFOBNRy5QJ-EEY,18290
+ sdg_hub/configs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/annotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/annotations/cot_reflection.yaml,sha256=60EdsTe1y7GoUIAWYSGfMa3EKI3oLZKCvDuKU7wHgQU,1737
+ sdg_hub/configs/annotations/detailed_annotations.yaml,sha256=in21xmlhxDJGEaWh1IgINh33tEyW9AuyG3k4pWBuKSM,1520
+ sdg_hub/configs/annotations/detailed_description.yaml,sha256=FsGbQMBxf1MAOi0nhrQ4icxcwYMlRura_ji9Pmeh1AA,192
+ sdg_hub/configs/annotations/detailed_description_icl.yaml,sha256=NDdwo5EShnYZjm1Fn80sZTAwfnwpPigixP2hvJ8--cU,679
+ sdg_hub/configs/annotations/simple_annotations.yaml,sha256=e2F_Ow8EG_me4XJ2cnBTlKb9y1FmdX0DHKkiMqiwdUQ,188
+ sdg_hub/configs/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/knowledge/atomic_facts.yaml,sha256=bIfQr0q0FyReO94v_lpLO56FikARCvFmZza-ISZTOnA,2453
+ sdg_hub/configs/knowledge/auxilary_instructions.yaml,sha256=aCgIjvNacdC2ZHThEvhZKvwORK6KqErVvVYQYQrIDLE,2034
+ sdg_hub/configs/knowledge/detailed_summary.yaml,sha256=_Mc_i9vaLp1OPKexSOURV5gbXEG41p1eELUukOhz8oM,388
+ sdg_hub/configs/knowledge/evaluate_faithfulness.yaml,sha256=iuvx5vNNm_jzHlmcKF83StaDYezRz2vQn3JUHM-TMdQ,3054
+ sdg_hub/configs/knowledge/evaluate_question.yaml,sha256=02mikEAJCUEkREBo7KxPY9H6iTUHQN-4cRkn2XMlVQ8,1915
+ sdg_hub/configs/knowledge/evaluate_relevancy.yaml,sha256=ASh8A1HAYO1h1tQRrwGnkUmK1n-WDKLdfW_LbSW1ipQ,3690
+ sdg_hub/configs/knowledge/extractive_summary.yaml,sha256=TYgJ7WQc7NFkf3GeRsbx6lwfA_xFnEOYGELewSqorp0,399
+ sdg_hub/configs/knowledge/generate_code_questions_responses.yaml,sha256=cIus2JYMYDvxHFVSU9QVa-1IK5KoChb3rCU2b4b9UmI,908
+ sdg_hub/configs/knowledge/generate_questions.yaml,sha256=iJtttZrVvlXFraUSrMowqTCLoJOLDbBndcTNMPTO8A4,2788
+ sdg_hub/configs/knowledge/generate_questions_responses.yaml,sha256=H9nb_5xGP7k6HtC3VboXqpiI5kQ9Xp3vjhXH3YIFesk,2525
+ sdg_hub/configs/knowledge/generate_responses.yaml,sha256=wwiB7lSB9yEB1XG2SIEIRtHkSlKh3NGJAmDaq2J6-ZY,2483
+ sdg_hub/configs/knowledge/mcq_generation.yaml,sha256=d4VKegnVIexwCn0e2AJs-0DC6XdLyUBGaCsQVwzICUE,3152
+ sdg_hub/configs/knowledge/router.yaml,sha256=9m_cX3xl808Vwrcq2PACyX45QFPkrV2nVYIY8x10JBU,119
+ sdg_hub/configs/knowledge/simple_generate_qa.yaml,sha256=OsuZP9SxQeUhTsHdhUO10mnjJ1u_6xekW5IQucFpRco,1565
+ sdg_hub/configs/reasoning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/reasoning/dynamic_cot.yaml,sha256=6XY_mFpB_oKFQ7U2CmHTqkJRGVHgOvpNmIDfhksYW6o,2641
+ sdg_hub/configs/skills/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/skills/analyzer.yaml,sha256=QBtyjaU6HBZqzNOmev_W4_scn_hH7Rfxd2xL_LcPLho,2261
+ sdg_hub/configs/skills/annotation.yaml,sha256=k5nJ357kUr0Uvq7Hkt3Ey22UbgSjgSjIomjHFfjaQnY,916
+ sdg_hub/configs/skills/contexts.yaml,sha256=MZ2QpuGhTce6kuEsMleaGblljhGG-yhXBuH42htA2P4,1161
+ sdg_hub/configs/skills/critic.yaml,sha256=Dr7anOKa7Xx1oDonXzsCfXwKIl4hUTArx2Sb_rgpLQI,1808
+ sdg_hub/configs/skills/evaluate_freeform_pair.yaml,sha256=MOI0-GyKrJ_O4v1mm8A1lIKxXfwcS3dA7GjlpDEuXRU,4055
+ sdg_hub/configs/skills/evaluate_freeform_questions.yaml,sha256=yDmLd-3A9pN5VLaT4lAcJ_ZvCY43LYlcS1KEdxpBRjU,2559
+ sdg_hub/configs/skills/evaluate_grounded_pair.yaml,sha256=vMQtsHpNxPOOHnkzqWPp-N1gSfwPqTbfcKmNfhb9WS8,4648
+ sdg_hub/configs/skills/evaluate_grounded_questions.yaml,sha256=9yr97azFhMdOfYp11BFtDSIhhP4wjQMOxYZnKWKlCPU,3115
+ sdg_hub/configs/skills/freeform_questions.yaml,sha256=N6R3c1jNiSSw6T-OUJULpLnPHuaSXjvoNjSqTKL6EOY,1500
+ sdg_hub/configs/skills/freeform_responses.yaml,sha256=4URTMsPpgSDOVj71Gw3lL82QWnUFR37iE72BIMwwv7c,1544
+ sdg_hub/configs/skills/grounded_questions.yaml,sha256=t6pKjt5Fp_ThZueB7JBrUKuQLQY_At-Y9O67OtrIXMo,1898
+ sdg_hub/configs/skills/grounded_responses.yaml,sha256=kVOeBp3BjKCFKG2qConXIQVVPI1EgcKJgKn6DFAkl1s,1860
+ sdg_hub/configs/skills/judge.yaml,sha256=FxnJA_wdmyMyMqGEZDAT8hc2itO845mGDNXgpmV2EUU,3203
+ sdg_hub/configs/skills/planner.yaml,sha256=yNF6t0EnmwYt1EV9Y3-vkmPcbOQRtvoLr8MITuiUw_A,2086
+ sdg_hub/configs/skills/respond.yaml,sha256=K1Q5X5_Q1k60hNDbHDjMYBzxbyOIEEHTQcXW6qQ4Ve0,108
+ sdg_hub/configs/skills/revised_responder.yaml,sha256=rjypOJbhZV9PuOD9YhlYgymxOJV8Zdzzz54x6Fxn2bY,2875
+ sdg_hub/configs/skills/router.yaml,sha256=7YnFp6H5wYD8W5Qn1Ac4r9dGBSFUDhZSNwmglQ99PgQ,3545
+ sdg_hub/configs/skills/simple_generate_qa_freeform.yaml,sha256=j8cJtEKSvtA__rE08iU6oz2XnfIgj0HiLVL8-6RhK3c,1431
+ sdg_hub/configs/skills/simple_generate_qa_grounded.yaml,sha256=tvX9EN5TArFesOOqpdN3hb-IHe7O82a2twQd-gzyCgw,1500
+ sdg_hub/configs/skills/icl_examples/STEM.yaml,sha256=5dcLC5jXOEeDasBkTunnHYrlddI3HcHYnEAXZcrd0ds,8412
+ sdg_hub/configs/skills/icl_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sdg_hub/configs/skills/icl_examples/coding.yaml,sha256=a5m-pUcV9xUb54gQ5U3vsU1RBXzOmsfX0CjTW7U62zo,5240
+ sdg_hub/configs/skills/icl_examples/extraction.yaml,sha256=P751l6NvFRkINWz-bX5jgnd_if2bl3d_NlhGI7g81xw,4654
+ sdg_hub/configs/skills/icl_examples/humanities.yaml,sha256=tZyiJ4Q3gG4uuoDXw6g__lX3ySEUaRZW2GhW1ustwaM,11370
+ sdg_hub/configs/skills/icl_examples/math.yaml,sha256=hNq-QudlXrg9CWLpJdrZ4v3vifGTWhyp2gcfwPdR3_o,6776
+ sdg_hub/configs/skills/icl_examples/reasoning.yaml,sha256=eesIlH9SO07TVF20gy18MZrcDzLhSmynd_F_lvg0oQg,4335
+ sdg_hub/configs/skills/icl_examples/roleplay.yaml,sha256=LYEyA7wv7QWQscUNQr0K_lotNoWSfuoAEncx3PCRYIs,6997
+ sdg_hub/configs/skills/icl_examples/writing.yaml,sha256=El-57IjZ5IvdcmCHyHvX_M2RFFkEos572220be8ecrQ,11335
+ sdg_hub/flows/generation/knowledge/mmlu_bench.yaml,sha256=Rueuxr_n1zabE_nGqOgUfh5hqVmEONRka9NLiZANSew,346
+ sdg_hub/flows/generation/knowledge/simple_knowledge.yaml,sha256=o4uyfs1nDiECcNROdsvHKiM46NYvQufo9dF4XSGpY54,298
+ sdg_hub/flows/generation/knowledge/synth_knowledge.yaml,sha256=ZTZvevfwDQSKUwPcv1i5IzIchsRHSEN03eTefedQmU8,2172
+ sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml,sha256=KYMdStAsfWKZhoFzEwTfl8XhF0qRSc6WsgJbzLWCw-U,3634
+ sdg_hub/flows/generation/skills/improve_responses.yaml,sha256=wUV0awTmKHNZ62pHiw_yz-IdG0OYgT_dCwlMUlZS3TA,2683
+ sdg_hub/flows/generation/skills/simple_freeform_skill.yaml,sha256=iVEomFH1E52JA7KLmTIwkS1PnzxUJVPMgbK2O-m80As,309
+ sdg_hub/flows/generation/skills/simple_grounded_skill.yaml,sha256=LTLxqdgbLIKSJonuIRHhcRSpit1EawwNvytWzXWXe2E,309
+ sdg_hub/flows/generation/skills/synth_grounded_skills.yaml,sha256=91Dm--agpmbm02hIVnFhEndjppKsQEWXDbckR9GAzKM,2045
+ sdg_hub/flows/generation/skills/synth_skills.yaml,sha256=9lhQcxXXbN4V9ztPph4fyjUtctll2FYtKY-V4grQdy4,1492
+ sdg_hub/utils/__init__.py,sha256=Jfs1DAVSYDNn8dfs0Uq2MguSwu77NyhP-KufSJICiBQ,278
+ sdg_hub/utils/config_validation.py,sha256=g92GxN73Mjr0cXvc5amB_Fn4iV9-iKeWmPz9HwLPmNY,3426
+ sdg_hub/utils/datautils.py,sha256=0t_SZ_UXBKl8uL6rVp3SUh8YKRbzKlh2oO5gr2cKyEw,389
+ sdg_hub/utils/error_handling.py,sha256=UvPEmtdpbBL71Zx8DWpIqd8869kEY2dlCH11iDgMfec,1847
+ sdg_hub/utils/path_resolution.py,sha256=M7hnwoyRQTKgwGC3Ld1_KmKaO_8Lu0PCk6JtQrLp67Q,2006
+ sdg_hub/utils/validation_result.py,sha256=O3zF6r49LQ9StAf_oWmK2bg-JfTQw6rpbHtHr9lI4ks,264
+ sdg_hub-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sdg_hub-0.1.2.dist-info/METADATA,sha256=4etDbH6APmsl8vh-b5H8-8r7pVCYBRWbqlRbf6gmYcY,7247
+ sdg_hub-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ sdg_hub-0.1.2.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
+ sdg_hub-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@ sdg_hub-0.1.2.dist-info/WHEEL
  Wheel-Version: 1.0
- Generator: setuptools (80.3.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,76 +0,0 @@ sdg_hub/blocks/filterblock.py (removed)
- # SPDX-License-Identifier: Apache-2.0
- # Standard
- import operator
-
- # Third Party
- from datasets import Dataset
-
- # Local
- from .block import Block
- from ..registry import BlockRegistry
- from ..logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("FilterByValueBlock")
- class FilterByValueBlock(Block):
-     def __init__(
-         self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
-     ) -> None:
-         """
-         Initializes a new instance of the FilterByValueBlock class.
-
-         Parameters:
-         - filter_column (str): The name of the column in the dataset to apply the filter on.
-         - filter_value (any or list of any): The value(s) to filter by.
-         - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
-         - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
-         - **batch_kwargs: Additional kwargs for batch processing.
-
-         Returns:
-         None
-         """
-         super().__init__(block_name=self.__class__.__name__)
-         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
-         self.column_name = filter_column
-         self.operation = operation
-         self.convert_dtype = convert_dtype
-         self.num_procs = batch_kwargs.get("num_procs", 1)
-
-     def _convert_dtype(self, sample):
-         try:
-             sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-         except ValueError as e:
-             logger.error(
-                 "Error converting dtype: %s, filling with None to be filtered later", e
-             )
-             sample[self.column_name] = None
-         return sample
-
-     def generate(self, samples) -> Dataset:
-         if self.convert_dtype:
-             samples = samples.map(
-                 self._convert_dtype,
-                 num_proc=self.num_procs,
-             )
-
-         if self.operation == operator.contains:
-             samples = samples.filter(
-                 lambda x: self.operation(self.value, x[self.column_name]),
-                 num_proc=self.num_procs,
-             )
-
-         samples = samples.filter(
-             lambda x: x[self.column_name] is not None,
-             num_proc=self.num_procs,
-         )
-
-         samples = samples.filter(
-             lambda x: any(
-                 self.operation(x[self.column_name], value) for value in self.value
-             ),
-             num_proc=self.num_procs,
-         )
-
-         return samples
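For context, a minimal sketch of how the removed block was driven, based only on its docstring and `generate` method above; it requires the older 0.1.0a4 wheel, and the column name, values, and threshold are illustrative:

```python
# Sketch of the removed FilterByValueBlock (present in sdg-hub 0.1.0a4 only).
# Column name, values, and threshold are illustrative.
import operator

from datasets import Dataset
from sdg_hub.blocks.filterblock import FilterByValueBlock

ds = Dataset.from_list([{"score": "2.0"}, {"score": "0.5"}, {"score": "bad"}])

# Keep rows whose score is >= 1.0. convert_dtype coerces strings to float and
# replaces unparseable values with None so they are dropped by the None filter.
block = FilterByValueBlock(
    filter_column="score",
    filter_value=1.0,
    operation=operator.ge,
    convert_dtype=float,
    num_procs=1,
)

filtered = block.generate(ds)
print(filtered["score"])  # [2.0]
```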
@@ -1,31 +0,0 @@ sdg_hub/blocks/iterblock.py (removed)
- # Third Party
- from datasets import Dataset
-
- # Local
- from .block import Block
- from ..registry import BlockRegistry
- from ..logger_config import setup_logger
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("IterBlock")
- class IterBlock(Block):
-     def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
-         super().__init__(block_name)
-         self.num_iters = num_iters
-         self.block = block_type(**block_kwargs)
-         self.gen_kwargs = kwargs.get("gen_kwargs", {})
-         self.gen_kwargs = kwargs.get("gen_kwargs", {})
-
-     def generate(self, samples, **gen_kwargs) -> Dataset:
-         generated_samples = []
-         num_iters = self.num_iters
-
-         for _ in range(num_iters):
-             batch_generated = self.block.generate(
-                 samples, **{**self.gen_kwargs, **gen_kwargs}
-             )
-             generated_samples.extend(batch_generated)
-
-         return Dataset.from_list(generated_samples)
@@ -1,194 +0,0 @@ sdg_hub/blocks/rmblocks.py (removed)
- """Module containing blocks for scoring responses using Reward Models."""
-
- # Standard
- from typing import Dict, List
- import json
- from urllib.parse import urljoin
-
- # Third Party
- from datasets import Dataset
- import requests
-
- # Local
- from .block import Block
- from ..logger_config import setup_logger
- from ..registry import BlockRegistry
-
- logger = setup_logger(__name__)
-
-
- @BlockRegistry.register("PRMBlock")
- class PRMBlock(Block):
-     """A block for scoring responses using a ProcessReward Model (PRM) via HTTP API.
-
-     This block sends prompts and responses to a PRM endpoint and returns reward scores
-     for each step in the response.
-     """
-
-     def __init__(
-         self,
-         block_name: str,
-         host: str,
-         port: int,
-         model_name: str,
-         prompt_col: str,
-         response_col: str,
-         output_col: str = "step_rewards",
-         system_prompt: str = None,
-         endpoint: str = "pooling",
-         step_separator: str = "\n\n",
-         step_fill_token: str = "<extra_0>",
-     ) -> None:
-         r"""Initialize the PRM (Process Reward Model) Block.
-
-         Parameters
-         ----------
-         block_name : str
-             Name of the block
-         host : str
-             Hostname of the PRM service (e.g., "0.0.0.0" or "localhost")
-         port : int
-             Port number the service is running on
-         model_name : str
-             Name of the PRM model to use
-         prompt_col : str
-             Column name containing the prompt
-         response_col : str
-             Column name containing the response
-         output_col : str, optional
-             Column name to store the reward scores, by default "step_rewards"
-         system_prompt : str, optional
-             Optional system prompt to use for scoring, by default None
-         endpoint : str, optional
-             API endpoint name, by default "pooling"
-         step_separator : str, optional
-             Separator between steps in the response, by default "\n\n"
-         step_fill_token : str, optional
-             Model specific fill token for steps in the response, by default "<extra_0>" used by Qwen2.5-Math-PRM
-         """
-         super().__init__(block_name)
-         # Construct base URL from host and port
-         self.base_url = f"http://{host.strip('/')}:{port}/"
-         self.endpoint = endpoint.strip("/")
-
-         # Construct the full API URL using urljoin
-         self.api_url = urljoin(self.base_url, self.endpoint)
-         logger.info(f"Initialized PRMBlock with API URL: {self.api_url}")
-
-         self.model_name = model_name
-         self.prompt_col = prompt_col
-         self.response_col = response_col
-         self.output_col = output_col
-         self.system_prompt = system_prompt
-         self.step_separator = step_separator
-         self.step_fill_token = step_fill_token
-
-     def _post_request(self, messages: List[Dict]) -> requests.Response:
-         """Make POST request to PRM API endpoint.
-
-         Parameters
-         ----------
-         messages : List[Dict]
-             List of message dictionaries to send to the API
-
-         Returns
-         -------
-         requests.Response
-             Response from the API
-         """
-         headers = {"User-Agent": "PRMBlock Client"}
-         prompt = {"model": self.model_name, "messages": messages}
-         response = requests.post(self.api_url, headers=headers, json=prompt)
-         return response
-
-     def _format_messages(self, sample: Dict) -> List[Dict]:
-         """Format input sample into messages for the PRM API.
-
-         Parameters
-         ----------
-         sample : Dict
-             Input sample containing prompt and response
-
-         Returns
-         -------
-         List[Dict]
-             Formatted messages for the API
-         """
-         messages = []
-         if self.system_prompt:
-             messages.append({"role": "system", "content": self.system_prompt})
-
-         messages.append({"role": "user", "content": sample[self.prompt_col]})
-         messages.append(
-             {
-                 "role": "assistant",
-                 "content": self.step_fill_token.join(sample[self.response_col].split(self.step_separator))
-                 + self.step_fill_token,
-             }
-         )
-         return messages
-
-     def _extract_rewards(self, response: requests.Response) -> List[float]:
-         """Extract reward scores from API response.
-
-         Parameters
-         ----------
-         response : requests.Response
-             Response from the API
-
-         Returns
-         -------
-         List[float]
-             List of reward scores
-         """
-         try:
-             response_data = response.json()
-             rewards = [x[1] for x in response_data["data"][0]["data"]]
-             return rewards
-         except (KeyError, IndexError, json.JSONDecodeError) as e:
-             logger.error(f"Error extracting rewards from response: {e}")
-             return []
-
-     def _generate(self, sample: dict) -> dict:
-         """Generate reward scores for the input samples.
-
-         Parameters
-         ----------
-         sample : dict
-             Input sample to score
-
-         Returns
-         -------
-         dict
-             Dictionary with added reward scores column
-         """
-         messages = self._format_messages(sample)
-         rm_response = self._post_request(messages)
-
-         if rm_response.status_code != 200:
-             logger.error(f"API request failed with status {rm_response.status_code}")
-             rewards = [0.0] * len(
-                 sample[self.response_col].split(self.step_separator)
-             ) # Default to 0 scores on failure
-         else:
-             rewards = self._extract_rewards(rm_response)
-
-         sample[self.output_col] = rewards
-         return sample
-
-     def generate(self, samples: Dataset, batch_size: int = 4) -> Dataset:
-         """Generate reward scores for the input samples.
-
-         Parameters
-         ----------
-         samples : Dataset
-             Input dataset containing samples to score
-         batch_size : int, optional
-             Number of processes to use for parallel processing, by default 4
-
-         Returns
-         -------
-         Dataset
-             Dataset with added reward scores
-         """
-         return samples.map(self._generate, num_proc=batch_size)
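For context, a sketch of the request payload the removed block posted to its `pooling` endpoint, reconstructed from `_format_messages` and `_post_request` above; the model name and texts are placeholders:

```python
# Reconstruction of the payload PRMBlock sent to http://<host>:<port>/pooling.
# Model name and texts are placeholders; only the shape follows the code above.
step_separator = "\n\n"
step_fill_token = "<extra_0>"

prompt = "What is 2 + 2? Answer step by step."
response = "First, 2 + 2 means adding two and two.\n\nTherefore the answer is 4."

messages = [
    {"role": "user", "content": prompt},
    {
        "role": "assistant",
        # Each step is terminated by the model-specific fill token, which the
        # reward model scores individually (one reward per fill token).
        "content": step_fill_token.join(response.split(step_separator)) + step_fill_token,
    },
]

payload = {"model": "Qwen/Qwen2.5-Math-PRM-7B", "messages": messages}
print(payload)
```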
@@ -1,10 +0,0 @@ sdg_hub/configs/annotations/simple.yaml (removed)
- system: ~
- introduction: |
-   Task Description: {{ simple_task_description }}
- principles: ~
- examples: ~
- generation: |
-   Here is the query for annotation:
-   {{ prompt }}
- start_tags: [""]
- end_tags: [""]