sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -74
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.3.dist-info/METADATA +0 -190
- sdg_hub-0.1.3.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Develop a series of educational question and answer pairs from a chapter in a {{domain}} textbook.
|
7
|
+
|
8
|
+
The questions should:
|
9
|
+
* Be self-contained, not requiring references to tables, figures, or specific sections in the text for understanding.
|
10
|
+
* Focus on teaching and reinforcing the key knowledge and concepts presented in the chapter.
|
11
|
+
* Avoid sections with minimal educational content like index pages or prefaces. In such cases, respond with [UNANSWERABLE].
|
12
|
+
* Be directly relevant to the textbook's domain. For instance, in a science textbook, questions should revolve around scientific terms, definitions, and practical applications, while in a legal textbook, they should cover legal principles, case law, and precedents.
|
13
|
+
* Be formulated to allow for independent answers, avoiding direct references to specific theorems or text sections. For example, rather than asking 'Under what conditions is the fixed point of a function unique according to Theorem 3.1.5?', ask 'How does the Fixed Point Iteration method contribute to understanding function uniqueness?'
|
14
|
+
* Span a range of difficulty levels to accommodate a diverse student audience, from basic understanding to advanced comprehension.
|
15
|
+
* Include a variety of question types such as multiple-choice for basic recall, short answer for deeper understanding, and essay or problem-solving questions to test application and analysis skills.
|
16
|
+
* Align closely with the learning objectives of the textbook or the specific chapter, ensuring that the questions test the fundamental concepts and skills that the chapter aims to impart.
|
17
|
+
|
18
|
+
Strictly follow this format for each question answer pair you generate while responding:
|
19
|
+
|
20
|
+
[QUESTION]
|
21
|
+
<Insert question here>
|
22
|
+
[ANSWER]
|
23
|
+
<Insert answer here>
|
24
|
+
[END]
|
25
|
+
|
26
|
+
Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook.
|
27
|
+
|
28
|
+
Here are some examples of questions:
|
29
|
+
|
30
|
+
[Document]
|
31
|
+
{{icl_document}}
|
32
|
+
|
33
|
+
[QUESTION]
|
34
|
+
{{icl_query_1}}
|
35
|
+
[ANSWER]
|
36
|
+
{{icl_response_1}}
|
37
|
+
[END]
|
38
|
+
|
39
|
+
[QUESTION]
|
40
|
+
{{icl_query_2}}
|
41
|
+
[ANSWER]
|
42
|
+
{{icl_response_2}}
|
43
|
+
[END]
|
44
|
+
|
45
|
+
[QUESTION]
|
46
|
+
{{icl_query_3}}
|
47
|
+
[ANSWER]
|
48
|
+
{{icl_response_3}}
|
49
|
+
[END]
|
50
|
+
|
51
|
+
Now, here is the document:
|
52
|
+
[DOCUMENT]
|
53
|
+
{{document_outline}}
|
54
|
+
{{document}}
|
@@ -0,0 +1,218 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: sdg_hub
|
3
|
+
Version: 0.2.0
|
4
|
+
Summary: Synthetic Data Generation
|
5
|
+
Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
|
6
|
+
License: Apache-2.0
|
7
|
+
Project-URL: homepage, https://ai-innovation.team/
|
8
|
+
Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
|
9
|
+
Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
|
10
|
+
Classifier: Environment :: Console
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
22
|
+
Requires-Python: >=3.10
|
23
|
+
Description-Content-Type: text/markdown
|
24
|
+
License-File: LICENSE
|
25
|
+
Requires-Dist: click<9.0.0,>=8.1.7
|
26
|
+
Requires-Dist: datasets<4.0.0,>=2.18.0
|
27
|
+
Requires-Dist: httpx<1.0.0,>=0.25.0
|
28
|
+
Requires-Dist: jinja2
|
29
|
+
Requires-Dist: litellm<1.75.0,>=1.73.0
|
30
|
+
Requires-Dist: openai<2.0.0,>=1.13.3
|
31
|
+
Requires-Dist: rich
|
32
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
33
|
+
Requires-Dist: python-dotenv<2.0.0,>=1.0.0
|
34
|
+
Requires-Dist: tenacity!=8.4.0,>=8.3.0
|
35
|
+
Requires-Dist: tqdm<5.0.0,>=4.66.2
|
36
|
+
Provides-Extra: vllm
|
37
|
+
Requires-Dist: vllm>=0.9.1; extra == "vllm"
|
38
|
+
Requires-Dist: torch>=2.0.0; extra == "vllm"
|
39
|
+
Requires-Dist: transformers>=4.37.0; extra == "vllm"
|
40
|
+
Requires-Dist: accelerate>=0.21.0; extra == "vllm"
|
41
|
+
Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
|
42
|
+
Provides-Extra: examples
|
43
|
+
Requires-Dist: tabulate>=0.9.0; extra == "examples"
|
44
|
+
Requires-Dist: transformers>=4.37.0; extra == "examples"
|
45
|
+
Requires-Dist: langchain-text-splitters; extra == "examples"
|
46
|
+
Requires-Dist: docling>=2.3.0; extra == "examples"
|
47
|
+
Requires-Dist: scikit-learn; extra == "examples"
|
48
|
+
Requires-Dist: pandas; extra == "examples"
|
49
|
+
Requires-Dist: polars; extra == "examples"
|
50
|
+
Requires-Dist: matplotlib; extra == "examples"
|
51
|
+
Requires-Dist: spacy; extra == "examples"
|
52
|
+
Requires-Dist: nltk; extra == "examples"
|
53
|
+
Requires-Dist: sentence-transformers; extra == "examples"
|
54
|
+
Requires-Dist: instructor; extra == "examples"
|
55
|
+
Requires-Dist: fastapi; extra == "examples"
|
56
|
+
Requires-Dist: nest-asyncio; extra == "examples"
|
57
|
+
Provides-Extra: dev
|
58
|
+
Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
|
59
|
+
Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
|
60
|
+
Requires-Dist: pylint-pydantic; extra == "dev"
|
61
|
+
Requires-Dist: pytest; extra == "dev"
|
62
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
63
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
64
|
+
Requires-Dist: pytest-html; extra == "dev"
|
65
|
+
Requires-Dist: tox<5,>=4.4.2; extra == "dev"
|
66
|
+
Requires-Dist: ruff; extra == "dev"
|
67
|
+
Dynamic: license-file
|
68
|
+
|
69
|
+
# `sdg_hub`: Synthetic Data Generation Toolkit
|
70
|
+
|
71
|
+
[Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
|
72
|
+
[Release](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
|
73
|
+
[License](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
|
74
|
+
[Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
|
75
|
+
[Codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.
|
80
|
+
|
81
|
+
**📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
|
82
|
+
|
83
|
+
## ✨ Key Features
|
84
|
+
|
85
|
+
**🔧 Modular Composability** - Mix and match blocks like Lego pieces. Build simple transformations or complex multi-stage pipelines with YAML-configured flows.
|
86
|
+
|
87
|
+
**⚡ Async Performance** - High-throughput LLM processing with built-in error handling.
|
88
|
+
|
89
|
+
**🛡️ Built-in Validation** - Pydantic-based type safety ensures your configurations and data are correct before execution.
|
90
|
+
|
91
|
+
**🔍 Auto-Discovery** - Automatic block and flow registration. No manual imports or complex setup.
|
92
|
+
|
93
|
+
**📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
|
94
|
+
|
95
|
+
**🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
|
96
|
+
|
97
|
+
|
98
|
+
## 📦 Installation
|
99
|
+
|
100
|
+
Recommended: Install uv — see https://docs.astral.sh/uv/getting-started/installation/
|
101
|
+
|
102
|
+
```bash
|
103
|
+
# Production
|
104
|
+
uv pip install sdg-hub
|
105
|
+
|
106
|
+
# Development
|
107
|
+
git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
|
108
|
+
cd sdg_hub
|
109
|
+
uv pip install .[dev]
|
110
|
+
# or: uv sync --extra dev
|
111
|
+
```
|
112
|
+
|
113
|
+
### Optional Dependencies
|
114
|
+
```bash
|
115
|
+
# For vLLM support
|
116
|
+
uv pip install sdg-hub[vllm]
|
117
|
+
|
118
|
+
# For examples
|
119
|
+
uv pip install sdg-hub[examples]
|
120
|
+
```
|
121
|
+
|
122
|
+
## 🚀 Quick Start
|
123
|
+
|
124
|
+
### 🧱 Core Concepts
|
125
|
+
|
126
|
+
**Blocks** are composable units that transform datasets - think of them as data processing Lego pieces. Each block performs a specific task: LLM chat, text parsing, evaluation, or transformation.
|
127
|
+
|
128
|
+
**Flows** orchestrate multiple blocks into complete pipelines defined in YAML. Chain blocks together to create complex data generation workflows with validation and parameter management.
|
129
|
+
|
130
|
+
```python
|
131
|
+
# Simple concept: Blocks transform data, Flows chain blocks together
|
132
|
+
dataset → Block₁ → Block₂ → Block₃ → enriched_dataset
|
133
|
+
```
|
134
|
+
|
135
|
+
### Try it out!
|
136
|
+
|
137
|
+
#### Flow Discovery
|
138
|
+
```python
|
139
|
+
from sdg_hub import FlowRegistry
|
140
|
+
|
141
|
+
# Auto-discover all available flows (no setup needed!)
|
142
|
+
FlowRegistry.discover_flows()
|
143
|
+
|
144
|
+
# List available flows
|
145
|
+
flows = FlowRegistry.list_flows()
|
146
|
+
print(f"Available flows: {flows}")
|
147
|
+
|
148
|
+
# Search for specific types
|
149
|
+
qa_flows = FlowRegistry.search_flows(tag="question-generation")
|
150
|
+
print(f"QA flows: {qa_flows}")
|
151
|
+
```
|
152
|
+
|
153
|
+
#### Using Flows
|
154
|
+
```python
|
155
|
+
from sdg_hub import FlowRegistry, Flow
|
156
|
+
from datasets import Dataset
|
157
|
+
|
158
|
+
# Load the flow by name
|
159
|
+
flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
|
160
|
+
flow_path = FlowRegistry.get_flow_path(flow_name)
|
161
|
+
flow = Flow.from_yaml(flow_path)
|
162
|
+
|
163
|
+
# Discover recommended models
|
164
|
+
default_model = flow.get_default_model()
|
165
|
+
recommendations = flow.get_model_recommendations()
|
166
|
+
|
167
|
+
# Configure model settings at runtime
|
168
|
+
# This assumes you have a hosted vLLM instance of meta-llama/Llama-3.3-70B-Instruct running at http://localhost:8000/v1
|
169
|
+
flow.set_model_config(
|
170
|
+
model=f"hosted_vllm/{default_model}",
|
171
|
+
api_base="http://localhost:8000/v1",
|
172
|
+
api_key="your_key",
|
173
|
+
)
|
174
|
+
|
175
|
+
# Create your dataset with required columns
|
176
|
+
dataset = Dataset.from_dict({
|
177
|
+
'document': ['Your document text here...'],
|
178
|
+
'document_outline': ['1. Topic A; 2. Topic B; 3. Topic C'],
|
179
|
+
'domain': ['Computer Science'],
|
180
|
+
'icl_document': ['Example document for in-context learning...'],
|
181
|
+
'icl_query_1': ['Example question 1?'],
|
182
|
+
'icl_response_1': ['Example answer 1'],
|
183
|
+
'icl_query_2': ['Example question 2?'],
|
184
|
+
'icl_response_2': ['Example answer 2'],
|
185
|
+
'icl_query_3': ['Example question 3?'],
|
186
|
+
'icl_response_3': ['Example answer 3']
|
187
|
+
})
|
188
|
+
|
189
|
+
# Generate high-quality QA pairs
|
190
|
+
result = flow.generate(dataset)
|
191
|
+
|
192
|
+
# Access generated content
|
193
|
+
questions = result['question']
|
194
|
+
answers = result['response']
|
195
|
+
faithfulness_scores = result['faithfulness_judgment']
|
196
|
+
relevancy_scores = result['relevancy_score']
|
197
|
+
```
|
198
|
+
|
199
|
+
#### Quick Testing with Dry Run
|
200
|
+
```python
|
201
|
+
# Test the flow with a small sample first
|
202
|
+
dry_result = flow.dry_run(dataset, sample_size=1)
|
203
|
+
print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
|
204
|
+
print(f"Output columns: {dry_result['final_dataset']['columns']}")
|
205
|
+
```
|
206
|
+
|
207
|
+
|
208
|
+
## 📄 License
|
209
|
+
|
210
|
+
This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
|
211
|
+
|
212
|
+
## 🤝 Contributing
|
213
|
+
|
214
|
+
We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project.
|
215
|
+
|
216
|
+
---
|
217
|
+
|
218
|
+
Built with ❤️ by the Red Hat AI Innovation Team
|
@@ -0,0 +1,63 @@
|
|
1
|
+
sdg_hub/__init__.py,sha256=Tw-6R5a8_W1kJcTAsW3R9ltBDP1dy5-fe7Tvt3cSyCQ,550
|
2
|
+
sdg_hub/_version.py,sha256=iB5DfB5V6YB5Wo4JmvS-txT42QtmGaWcWp3udRT7zCI,511
|
3
|
+
sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
sdg_hub/core/__init__.py,sha256=NwqB4fwhC29W50VW7QXZssLxx122YvgO9LHDLdgAnrI,496
|
5
|
+
sdg_hub/core/blocks/__init__.py,sha256=9sCkCvDQzJGSedaePVlEIpbNwrkBz_K500VW_6FLhuE,1601
|
6
|
+
sdg_hub/core/blocks/base.py,sha256=TrzUAkG7Tiquk0Z3SOFsb5mRnHd1IbHH6gFPVH1P7T8,10424
|
7
|
+
sdg_hub/core/blocks/registry.py,sha256=a9CcjA5n7JWmfTyeQPml14aW0tlYU9QLkSkskKWJT2o,9771
|
8
|
+
sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
|
9
|
+
sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
|
10
|
+
sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
|
11
|
+
sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
|
12
|
+
sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
|
13
|
+
sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
|
14
|
+
sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
|
15
|
+
sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
|
16
|
+
sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=ABcXZrqEMsgKfdGAkSo2plMp4LsZSqPhEQugoDEYm1I,2950
|
17
|
+
sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
|
18
|
+
sdg_hub/core/blocks/evaluation/__init__.py,sha256=kFXee-vsVVdU2XtLio9qHgPx_a0zoB_rQr509EKBGJc,357
|
19
|
+
sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py,sha256=ZuQ8jq2JwTdslUJtFi1E9NXebCWFZS8isXOafcJ_CMU,23026
|
20
|
+
sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py,sha256=ieQRwl4bx5EQ3m7Wa2P3pHLUPQY7HuwNWjHUCo98u6g,22832
|
21
|
+
sdg_hub/core/blocks/evaluation/verify_question_block.py,sha256=fSNbW1KpdfVE0fQsm4Y8QfVk6A3J5H3C0dtGn49t8tM,22853
|
22
|
+
sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
|
23
|
+
sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=H8Gif0q9Wc_d1TnVow8Zpsg7blJOFGN1EZmV6OPpkcg,5971
|
24
|
+
sdg_hub/core/blocks/llm/__init__.py,sha256=qAb-pzbI3EqjOVjU48Y63cR3Oly5ZjCkhdwkk1ltqTc,732
|
25
|
+
sdg_hub/core/blocks/llm/client_manager.py,sha256=vaoPoTITJ9IlooeVRfu6M4WBc08mp4aJZ5tvnl2fMv8,12309
|
26
|
+
sdg_hub/core/blocks/llm/config.py,sha256=TmbfqxPHH3mShTK2EuCX2AGKtDvl0aSvihsaqgzABtM,11266
|
27
|
+
sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
|
28
|
+
sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=3o2oV_ecWsEHFp5FWPIpBT-yJ1imJmeZy2b9GZL-T54,20121
|
29
|
+
sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
|
30
|
+
sdg_hub/core/blocks/llm/text_parser_block.py,sha256=9n6pHKVmMD1wwEYdFs0kIz5TblmDxl5dtmbyLZHGivo,12005
|
31
|
+
sdg_hub/core/blocks/transform/__init__.py,sha256=Y_3izPCtgnMbFK-gBMeLHZspSrNLgbGheAJXU57XfFw,746
|
32
|
+
sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
|
33
|
+
sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=mGup5agvDf9kAFSvXE5X6Puo6CQc9UOdFdbhdFWJjwk,8225
|
34
|
+
sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
|
35
|
+
sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
|
36
|
+
sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
|
37
|
+
sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
|
38
|
+
sdg_hub/core/flow/__init__.py,sha256=N2NZGngvd7qpT5FI_knKukUFM0IkD9K5jdTi-gDeUI4,475
|
39
|
+
sdg_hub/core/flow/base.py,sha256=0sx_chQIeuBcLH1fNMkkD0PxX5UeEv_pCBxYI0Byzi8,36884
|
40
|
+
sdg_hub/core/flow/metadata.py,sha256=_IfFWtCukYoMMG2QWRganUl0uGQO_jxniIVBlVmutus,11487
|
41
|
+
sdg_hub/core/flow/migration.py,sha256=g0Ug4ZrR_ssxJ-ESVP7ubkD0kql6aSChOuMmx-ZMn8A,7198
|
42
|
+
sdg_hub/core/flow/registry.py,sha256=T2veU05h4Q9vb_6F_NYHnNuFZE21orWsx1-iGl0aoJk,9564
|
43
|
+
sdg_hub/core/flow/validation.py,sha256=g0G7MH3bz7kcNsfRrlSi8iJZi8gqVcgODhHygVYtJVI,9185
|
44
|
+
sdg_hub/core/utils/__init__.py,sha256=y_D7HcRxw7FXShw5USQpCt-5h4VXOFFvMOMN3_oALiw,279
|
45
|
+
sdg_hub/core/utils/datautils.py,sha256=qKK2HXAqI4t-O-9RMu2DdaQVZwTnJj-W7-Hc5o1iqZw,379
|
46
|
+
sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
|
47
|
+
sdg_hub/core/utils/logger_config.py,sha256=MPYdpyNXh_pxFUOAvSCHa98LGjxjaLXoUoqWekqTG4s,422
|
48
|
+
sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
|
49
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml,sha256=xgUNY793y4lcpdtuWm5Ah1CmbU2gvvPQCpZMMa6kPXU,2447
|
52
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml,sha256=_vF-AzjC8d6wqAle5pkQ103EW-BbAhNA0qllk3ojUZc,353
|
53
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml,sha256=GiIipXrjm7btghvpgFUoTZYAJRyu7yE-WEi5yDLxjY4,3032
|
54
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
|
55
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
|
56
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
|
57
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=RrWr2jaandGgLkJiBLFPPA1g6B6vmL98-qXPozqjHKQ,6286
|
58
|
+
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
|
59
|
+
sdg_hub-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
60
|
+
sdg_hub-0.2.0.dist-info/METADATA,sha256=APjsGUk94_tQRVlncgVxkEOTSOpHY25SOMmOO1lt0P0,8464
|
61
|
+
sdg_hub-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
62
|
+
sdg_hub-0.2.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
|
63
|
+
sdg_hub-0.2.0.dist-info/RECORD,,
|
sdg_hub/blocks/__init__.py
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
"""Block implementations for SDG Hub.
|
2
|
-
|
3
|
-
This package provides various block implementations for data generation, processing, and transformation.
|
4
|
-
"""
|
5
|
-
|
6
|
-
# Local
|
7
|
-
from .block import Block
|
8
|
-
from .llmblock import LLMBlock, ConditionalLLMBlock
|
9
|
-
from .openaichatblock import (
|
10
|
-
OpenAIChatBlock,
|
11
|
-
OpenAIAsyncChatBlock
|
12
|
-
)
|
13
|
-
from .utilblocks import (
|
14
|
-
SamplePopulatorBlock,
|
15
|
-
SelectorBlock,
|
16
|
-
CombineColumnsBlock,
|
17
|
-
FlattenColumnsBlock,
|
18
|
-
DuplicateColumns,
|
19
|
-
RenameColumns,
|
20
|
-
SetToMajorityValue,
|
21
|
-
FilterByValueBlock,
|
22
|
-
IterBlock,
|
23
|
-
)
|
24
|
-
from ..registry import BlockRegistry
|
25
|
-
|
26
|
-
__all__ = [
|
27
|
-
"Block",
|
28
|
-
"FilterByValueBlock",
|
29
|
-
"IterBlock",
|
30
|
-
"LLMBlock",
|
31
|
-
"ConditionalLLMBlock",
|
32
|
-
"SamplePopulatorBlock",
|
33
|
-
"SelectorBlock",
|
34
|
-
"CombineColumnsBlock",
|
35
|
-
"FlattenColumnsBlock",
|
36
|
-
"DuplicateColumns",
|
37
|
-
"RenameColumns",
|
38
|
-
"SetToMajorityValue",
|
39
|
-
"BlockRegistry",
|
40
|
-
"OpenAIChatBlock",
|
41
|
-
"OpenAIAsyncChatBlock"
|
42
|
-
]
|
sdg_hub/blocks/block.py
DELETED
@@ -1,96 +0,0 @@
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Base block implementation for the SDG Hub system.
|
3
|
-
|
4
|
-
This module provides the abstract base class for all blocks in the system,
|
5
|
-
including functionality for template validation and configuration management.
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Standard
|
9
|
-
from abc import ABC
|
10
|
-
from collections import ChainMap
|
11
|
-
from typing import Any, Dict, Optional
|
12
|
-
|
13
|
-
# Third Party
|
14
|
-
from jinja2 import Template, UndefinedError
|
15
|
-
import yaml
|
16
|
-
|
17
|
-
# Local
|
18
|
-
from ..registry import BlockRegistry
|
19
|
-
from ..logger_config import setup_logger
|
20
|
-
|
21
|
-
logger = setup_logger(__name__)
|
22
|
-
|
23
|
-
|
24
|
-
@BlockRegistry.register("Block")
|
25
|
-
class Block(ABC):
|
26
|
-
"""Base abstract class for all blocks in the system.
|
27
|
-
|
28
|
-
This class provides common functionality for block validation and configuration loading.
|
29
|
-
All specific block implementations should inherit from this class.
|
30
|
-
"""
|
31
|
-
|
32
|
-
def __init__(self, block_name: str) -> None:
|
33
|
-
self.block_name = block_name
|
34
|
-
|
35
|
-
@staticmethod
|
36
|
-
def _validate(prompt_template: Template, input_dict: Dict[str, Any]) -> bool:
|
37
|
-
"""Validate the input data for this block.
|
38
|
-
|
39
|
-
This method validates whether all required variables in the Jinja template are provided in the input_dict.
|
40
|
-
|
41
|
-
Parameters
|
42
|
-
----------
|
43
|
-
prompt_template : Template
|
44
|
-
The Jinja2 template object.
|
45
|
-
input_dict : Dict[str, Any]
|
46
|
-
A dictionary of input values to check against the template.
|
47
|
-
|
48
|
-
Returns
|
49
|
-
-------
|
50
|
-
bool
|
51
|
-
True if the input data is valid (i.e., no missing variables), False otherwise.
|
52
|
-
"""
|
53
|
-
|
54
|
-
class Default(dict):
|
55
|
-
def __missing__(self, key: str) -> None:
|
56
|
-
raise KeyError(key)
|
57
|
-
|
58
|
-
try:
|
59
|
-
# Try rendering the template with the input_dict
|
60
|
-
prompt_template.render(ChainMap(input_dict, Default()))
|
61
|
-
return True
|
62
|
-
except UndefinedError as e:
|
63
|
-
logger.error(f"Missing key: {e}")
|
64
|
-
return False
|
65
|
-
|
66
|
-
def _load_config(self, config_path: str) -> Optional[Dict[str, Any]]:
|
67
|
-
"""Load the configuration file for this block.
|
68
|
-
|
69
|
-
Parameters
|
70
|
-
----------
|
71
|
-
config_path : str
|
72
|
-
The path to the configuration file.
|
73
|
-
|
74
|
-
Returns
|
75
|
-
-------
|
76
|
-
Optional[Dict[str, Any]]
|
77
|
-
The loaded configuration. Returns None if the YAML cannot be parsed or the file cannot be read for any reason other than being missing.
|
78
|
-
|
79
|
-
Raises
|
80
|
-
------
|
81
|
-
FileNotFoundError
|
82
|
-
If the configuration file does not exist.
|
83
|
-
"""
|
84
|
-
try:
|
85
|
-
with open(config_path, "r", encoding="utf-8") as config_file:
|
86
|
-
try:
|
87
|
-
return yaml.safe_load(config_file)
|
88
|
-
except yaml.YAMLError as e:
|
89
|
-
logger.error(f"Error parsing YAML from {config_path}: {e}")
|
90
|
-
return None
|
91
|
-
except FileNotFoundError:
|
92
|
-
logger.error(f"Configuration file not found: {config_path}")
|
93
|
-
raise
|
94
|
-
except Exception as e:
|
95
|
-
logger.error(f"Unexpected error reading config file {config_path}: {e}")
|
96
|
-
return None
|