sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -74
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.3.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.3.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,54 @@
1
+ - role: system
2
+ content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
3
+
4
+ - role: user
5
+ content: |
6
+ Develop a series of educational question and answer pairs from a chapter in a {{domain}} textbook.
7
+
8
+ The questions should:
9
+ * Be self-contained, not requiring references to tables, figures, or specific sections in the text for understanding.
10
+ * Focus on teaching and reinforcing the key knowledge and concepts presented in the chapter.
11
+ * Avoid sections with minimal educational content like index pages or prefaces. In such cases, respond with [UNANSWERABLE].
12
+ * Be directly relevant to the textbook's domain. For instance, in a science textbook, questions should revolve around scientific terms, definitions, and practical applications, while in a legal textbook, they should cover legal principles, case law, and precedents.
13
+ * Be formulated to allow for independent answers, avoiding direct references to specific theorems or text sections. For example, rather than asking 'Under what conditions is the fixed point of a function unique according to Theorem 3.1.5?', ask 'How does the Fixed Point Iteration method contribute to understanding function uniqueness?'
14
+ * Span a range of difficulty levels to accommodate a diverse student audience, from basic understanding to advanced comprehension.
15
+ * Include a variety of question types such as multiple-choice for basic recall, short answer for deeper understanding, and essay or problem-solving questions to test application and analysis skills.
16
+ * Align closely with the learning objectives of the textbook or the specific chapter, ensuring that the questions test the fundamental concepts and skills that the chapter aims to impart.
17
+
18
+ Strictly follow this format for each question answer pair you generate while responding:
19
+
20
+ [QUESTION]
21
+ <Insert question here>
22
+ [ANSWER]
23
+ <Insert answer here>
24
+ [END]
25
+
26
+ Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook.
27
+
28
+ Here are some examples of questions:
29
+
30
+ [Document]
31
+ {{icl_document}}
32
+
33
+ [QUESTION]
34
+ {{icl_query_1}}
35
+ [ANSWER]
36
+ {{icl_response_1}}
37
+ [END]
38
+
39
+ [QUESTION]
40
+ {{icl_query_2}}
41
+ [ANSWER]
42
+ {{icl_response_2}}
43
+ [END]
44
+
45
+ [QUESTION]
46
+ {{icl_query_3}}
47
+ [ANSWER]
48
+ {{icl_response_3}}
49
+ [END]
50
+
51
+ Now, here is the document:
52
+ [DOCUMENT]
53
+ {{document_outline}}
54
+ {{document}}
@@ -0,0 +1,218 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdg_hub
3
+ Version: 0.2.0
4
+ Summary: Synthetic Data Generation
5
+ Author-email: Red Hat AI Innovation <abhandwa@redhat.com>
6
+ License: Apache-2.0
7
+ Project-URL: homepage, https://ai-innovation.team/
8
+ Project-URL: source, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub
9
+ Project-URL: issues, https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/issues
10
+ Classifier: Environment :: Console
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: MacOS :: MacOS X
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: Implementation :: CPython
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: click<9.0.0,>=8.1.7
26
+ Requires-Dist: datasets<4.0.0,>=2.18.0
27
+ Requires-Dist: httpx<1.0.0,>=0.25.0
28
+ Requires-Dist: jinja2
29
+ Requires-Dist: litellm<1.75.0,>=1.73.0
30
+ Requires-Dist: openai<2.0.0,>=1.13.3
31
+ Requires-Dist: rich
32
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
33
+ Requires-Dist: python-dotenv<2.0.0,>=1.0.0
34
+ Requires-Dist: tenacity!=8.4.0,>=8.3.0
35
+ Requires-Dist: tqdm<5.0.0,>=4.66.2
36
+ Provides-Extra: vllm
37
+ Requires-Dist: vllm>=0.9.1; extra == "vllm"
38
+ Requires-Dist: torch>=2.0.0; extra == "vllm"
39
+ Requires-Dist: transformers>=4.37.0; extra == "vllm"
40
+ Requires-Dist: accelerate>=0.21.0; extra == "vllm"
41
+ Requires-Dist: xformers>=0.0.22.post7; extra == "vllm"
42
+ Provides-Extra: examples
43
+ Requires-Dist: tabulate>=0.9.0; extra == "examples"
44
+ Requires-Dist: transformers>=4.37.0; extra == "examples"
45
+ Requires-Dist: langchain-text-splitters; extra == "examples"
46
+ Requires-Dist: docling>=2.3.0; extra == "examples"
47
+ Requires-Dist: scikit-learn; extra == "examples"
48
+ Requires-Dist: pandas; extra == "examples"
49
+ Requires-Dist: polars; extra == "examples"
50
+ Requires-Dist: matplotlib; extra == "examples"
51
+ Requires-Dist: spacy; extra == "examples"
52
+ Requires-Dist: nltk; extra == "examples"
53
+ Requires-Dist: sentence-transformers; extra == "examples"
54
+ Requires-Dist: instructor; extra == "examples"
55
+ Requires-Dist: fastapi; extra == "examples"
56
+ Requires-Dist: nest-asyncio; extra == "examples"
57
+ Provides-Extra: dev
58
+ Requires-Dist: pre-commit<4.0,>=3.0.4; extra == "dev"
59
+ Requires-Dist: pylint<4.0,>=2.16.2; extra == "dev"
60
+ Requires-Dist: pylint-pydantic; extra == "dev"
61
+ Requires-Dist: pytest; extra == "dev"
62
+ Requires-Dist: pytest-asyncio; extra == "dev"
63
+ Requires-Dist: pytest-cov; extra == "dev"
64
+ Requires-Dist: pytest-html; extra == "dev"
65
+ Requires-Dist: tox<5,>=4.4.2; extra == "dev"
66
+ Requires-Dist: ruff; extra == "dev"
67
+ Dynamic: license-file
68
+
69
+ # `sdg_hub`: Synthetic Data Generation Toolkit
70
+
71
+ [![Build](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml/badge.svg?branch=main)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/pypi.yaml)
72
+ [![Release](https://img.shields.io/github/v/release/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/releases)
73
+ [![License](https://img.shields.io/github/license/Red-Hat-AI-Innovation-Team/sdg_hub)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/blob/main/LICENSE)
74
+ [![Tests](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml/badge.svg)](https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub/actions/workflows/test.yml)
75
+ [![codecov](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub/graph/badge.svg?token=SP75BCXWO2)](https://codecov.io/gh/Red-Hat-AI-Innovation-Team/sdg_hub)
76
+
77
+
78
+
79
+ A modular Python framework for building synthetic data generation pipelines using composable blocks and flows. Transform datasets through **building-block composition** - mix and match LLM-powered and traditional processing blocks to create sophisticated data generation workflows.
80
+
81
+ **📖 Full documentation available at: [https://ai-innovation.team/sdg_hub](https://ai-innovation.team/sdg_hub)**
82
+
83
+ ## ✨ Key Features
84
+
85
+ **🔧 Modular Composability** - Mix and match blocks like Lego pieces. Build simple transformations or complex multi-stage pipelines with YAML-configured flows.
86
+
87
+ **⚡ Async Performance** - High-throughput LLM processing with built-in error handling.
88
+
89
+ **🛡️ Built-in Validation** - Pydantic-based type safety ensures your configurations and data are correct before execution.
90
+
91
+ **🔍 Auto-Discovery** - Automatic block and flow registration. No manual imports or complex setup.
92
+
93
+ **📊 Rich Monitoring** - Detailed logging with progress bars and execution summaries.
94
+
95
+ **🧩 Easily Extensible** - Create custom blocks with simple inheritance. Rich logging and monitoring built-in.
96
+
97
+
98
+ ## 📦 Installation
99
+
100
+ Recommended: Install uv — see https://docs.astral.sh/uv/getting-started/installation/
101
+
102
+ ```bash
103
+ # Production
104
+ uv pip install sdg-hub
105
+
106
+ # Development
107
+ git clone https://github.com/Red-Hat-AI-Innovation-Team/sdg_hub.git
108
+ cd sdg_hub
109
+ uv pip install .[dev]
110
+ # or: uv sync --extra dev
111
+ ```
112
+
113
+ ### Optional Dependencies
114
+ ```bash
115
+ # For vLLM support
116
+ uv pip install sdg-hub[vllm]
117
+
118
+ # For examples
119
+ uv pip install sdg-hub[examples]
120
+ ```
121
+
122
+ ## 🚀 Quick Start
123
+
124
+ ### 🧱 Core Concepts
125
+
126
+ **Blocks** are composable units that transform datasets - think of them as data processing Lego pieces. Each block performs a specific task: LLM chat, text parsing, evaluation, or transformation.
127
+
128
+ **Flows** orchestrate multiple blocks into complete pipelines defined in YAML. Chain blocks together to create complex data generation workflows with validation and parameter management.
129
+
130
+ ```text
131
+ # Simple concept: Blocks transform data, Flows chain blocks together
132
+ dataset → Block₁ → Block₂ → Block₃ → enriched_dataset
133
+ ```
134
+
135
+ ### Try it out!
136
+
137
+ #### Flow Discovery
138
+ ```python
139
+ from sdg_hub import FlowRegistry
140
+
141
+ # Auto-discover all available flows (no setup needed!)
142
+ FlowRegistry.discover_flows()
143
+
144
+ # List available flows
145
+ flows = FlowRegistry.list_flows()
146
+ print(f"Available flows: {flows}")
147
+
148
+ # Search for specific types
149
+ qa_flows = FlowRegistry.search_flows(tag="question-generation")
150
+ print(f"QA flows: {qa_flows}")
151
+ ```
152
+
153
+ #### Using Flows
154
+ ```python
155
+ from sdg_hub import FlowRegistry, Flow
156
+ from datasets import Dataset
157
+
158
+ # Load the flow by name
159
+ flow_name = "Advanced Document Grounded Question-Answer Generation Flow for Knowledge Tuning"
160
+ flow_path = FlowRegistry.get_flow_path(flow_name)
161
+ flow = Flow.from_yaml(flow_path)
162
+
163
+ # Discover recommended models
164
+ default_model = flow.get_default_model()
165
+ recommendations = flow.get_model_recommendations()
166
+
167
+ # Configure model settings at runtime
168
+ # This assumes you have a hosted vLLM instance of meta-llama/Llama-3.3-70B-Instruct running at http://localhost:8000/v1
169
+ flow.set_model_config(
170
+ model=f"hosted_vllm/{default_model}",
171
+ api_base="http://localhost:8000/v1",
172
+ api_key="your_key",
173
+ )
174
+
175
+ # Create your dataset with required columns
176
+ dataset = Dataset.from_dict({
177
+ 'document': ['Your document text here...'],
178
+ 'document_outline': ['1. Topic A; 2. Topic B; 3. Topic C'],
179
+ 'domain': ['Computer Science'],
180
+ 'icl_document': ['Example document for in-context learning...'],
181
+ 'icl_query_1': ['Example question 1?'],
182
+ 'icl_response_1': ['Example answer 1'],
183
+ 'icl_query_2': ['Example question 2?'],
184
+ 'icl_response_2': ['Example answer 2'],
185
+ 'icl_query_3': ['Example question 3?'],
186
+ 'icl_response_3': ['Example answer 3']
187
+ })
188
+
189
+ # Generate high-quality QA pairs
190
+ result = flow.generate(dataset)
191
+
192
+ # Access generated content
193
+ questions = result['question']
194
+ answers = result['response']
195
+ faithfulness_scores = result['faithfulness_judgment']
196
+ relevancy_scores = result['relevancy_score']
197
+ ```
198
+
199
+ #### Quick Testing with Dry Run
200
+ ```python
201
+ # Test the flow with a small sample first
202
+ dry_result = flow.dry_run(dataset, sample_size=1)
203
+ print(f"Dry run completed in {dry_result['execution_time_seconds']:.2f}s")
204
+ print(f"Output columns: {dry_result['final_dataset']['columns']}")
205
+ ```
206
+
207
+
208
+ ## 📄 License
209
+
210
+ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
211
+
212
+ ## 🤝 Contributing
213
+
214
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project.
215
+
216
+ ---
217
+
218
+ Built with ❤️ by the Red Hat AI Innovation Team
@@ -0,0 +1,63 @@
1
+ sdg_hub/__init__.py,sha256=Tw-6R5a8_W1kJcTAsW3R9ltBDP1dy5-fe7Tvt3cSyCQ,550
2
+ sdg_hub/_version.py,sha256=iB5DfB5V6YB5Wo4JmvS-txT42QtmGaWcWp3udRT7zCI,511
3
+ sdg_hub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ sdg_hub/core/__init__.py,sha256=NwqB4fwhC29W50VW7QXZssLxx122YvgO9LHDLdgAnrI,496
5
+ sdg_hub/core/blocks/__init__.py,sha256=9sCkCvDQzJGSedaePVlEIpbNwrkBz_K500VW_6FLhuE,1601
6
+ sdg_hub/core/blocks/base.py,sha256=TrzUAkG7Tiquk0Z3SOFsb5mRnHd1IbHH6gFPVH1P7T8,10424
7
+ sdg_hub/core/blocks/registry.py,sha256=a9CcjA5n7JWmfTyeQPml14aW0tlYU9QLkSkskKWJT2o,9771
8
+ sdg_hub/core/blocks/deprecated_blocks/__init__.py,sha256=RDu3MWFStDQko-TKkx8tGoB1UTatP_RSldZK43zHDvY,889
9
+ sdg_hub/core/blocks/deprecated_blocks/combine_columns.py,sha256=HCvpaYsAwgx1Dm0vIshcWsKoVsRT0KrmKp9j4oqtByc,2757
10
+ sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py,sha256=maCaaEs0EMMzt7L1xm7fAH3ylaFMHEkeC_dtOw3FrjU,2694
11
+ sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py,sha256=-fuuMKj2g2MrijMBTd0PWtYBbf9anQ2UkYXHigCxxJI,3328
12
+ sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py,sha256=IenCskrPEv09h2uT6aZKCQzaxgA_3kAzOeJSd-R_-EA,2839
13
+ sdg_hub/core/blocks/deprecated_blocks/llmblock.py,sha256=34lzC43BODpMk5AwlWA1ctdYPmN7cA6WL5vMXaI0P0Y,20385
14
+ sdg_hub/core/blocks/deprecated_blocks/rename_columns.py,sha256=thp-mHtkRmUw_nYKpldy_mLWR2AvC5YUhbqDETM6-T0,2620
15
+ sdg_hub/core/blocks/deprecated_blocks/sample_populator.py,sha256=UdueMApxOmPWaxxMrw7b1v74fKJBfqqRATEBqgmVtNw,1737
16
+ sdg_hub/core/blocks/deprecated_blocks/selector.py,sha256=ABcXZrqEMsgKfdGAkSo2plMp4LsZSqPhEQugoDEYm1I,2950
17
+ sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py,sha256=44TQu-rK5isia-otMVB1zHd8D-wWmu3C8CI1NLtfY5s,2729
18
+ sdg_hub/core/blocks/evaluation/__init__.py,sha256=kFXee-vsVVdU2XtLio9qHgPx_a0zoB_rQr509EKBGJc,357
19
+ sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py,sha256=ZuQ8jq2JwTdslUJtFi1E9NXebCWFZS8isXOafcJ_CMU,23026
20
+ sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py,sha256=ieQRwl4bx5EQ3m7Wa2P3pHLUPQY7HuwNWjHUCo98u6g,22832
21
+ sdg_hub/core/blocks/evaluation/verify_question_block.py,sha256=fSNbW1KpdfVE0fQsm4Y8QfVk6A3J5H3C0dtGn49t8tM,22853
22
+ sdg_hub/core/blocks/filtering/__init__.py,sha256=isxSVSvDqkMjG8dQSl3Q2M4g5c1t9fTjBSA21icf-yA,275
23
+ sdg_hub/core/blocks/filtering/column_value_filter.py,sha256=H8Gif0q9Wc_d1TnVow8Zpsg7blJOFGN1EZmV6OPpkcg,5971
24
+ sdg_hub/core/blocks/llm/__init__.py,sha256=qAb-pzbI3EqjOVjU48Y63cR3Oly5ZjCkhdwkk1ltqTc,732
25
+ sdg_hub/core/blocks/llm/client_manager.py,sha256=vaoPoTITJ9IlooeVRfu6M4WBc08mp4aJZ5tvnl2fMv8,12309
26
+ sdg_hub/core/blocks/llm/config.py,sha256=TmbfqxPHH3mShTK2EuCX2AGKtDvl0aSvihsaqgzABtM,11266
27
+ sdg_hub/core/blocks/llm/error_handler.py,sha256=7T-019ZFB9qgZoX1ybIiXyaLjPzrF96qcKmUu6vmO6g,12178
28
+ sdg_hub/core/blocks/llm/llm_chat_block.py,sha256=3o2oV_ecWsEHFp5FWPIpBT-yJ1imJmeZy2b9GZL-T54,20121
29
+ sdg_hub/core/blocks/llm/prompt_builder_block.py,sha256=fkJd718X1oYlMY1cjo_8WCO16Gl8Tm0bUPWR78E_uws,13935
30
+ sdg_hub/core/blocks/llm/text_parser_block.py,sha256=9n6pHKVmMD1wwEYdFs0kIz5TblmDxl5dtmbyLZHGivo,12005
31
+ sdg_hub/core/blocks/transform/__init__.py,sha256=Y_3izPCtgnMbFK-gBMeLHZspSrNLgbGheAJXU57XfFw,746
32
+ sdg_hub/core/blocks/transform/duplicate_columns.py,sha256=SaP7rIF4ZFEFFa50aU2xGNIuddXaEZrKxdWfHjzFpVI,2833
33
+ sdg_hub/core/blocks/transform/index_based_mapper.py,sha256=mGup5agvDf9kAFSvXE5X6Puo6CQc9UOdFdbhdFWJjwk,8225
34
+ sdg_hub/core/blocks/transform/melt_columns.py,sha256=vaYa5Taq6GhNZYWFL4uPK3-SfN2BsKEm-wvjd2EYYoI,4382
35
+ sdg_hub/core/blocks/transform/rename_columns.py,sha256=qeB5L2utqDQnutUetH1VKZSqDiJSH_yUp5EFCV-XCVI,1998
36
+ sdg_hub/core/blocks/transform/text_concat.py,sha256=_-B__Hob1WwgwkILPIZvTnsDzuwtoX1hKviyzHlnnes,3149
37
+ sdg_hub/core/blocks/transform/uniform_col_val_setter.py,sha256=XnjiT29z3PzIPy8M-mmE2w-Miab6Ed5ahy32SaxTCTE,3263
38
+ sdg_hub/core/flow/__init__.py,sha256=N2NZGngvd7qpT5FI_knKukUFM0IkD9K5jdTi-gDeUI4,475
39
+ sdg_hub/core/flow/base.py,sha256=0sx_chQIeuBcLH1fNMkkD0PxX5UeEv_pCBxYI0Byzi8,36884
40
+ sdg_hub/core/flow/metadata.py,sha256=_IfFWtCukYoMMG2QWRganUl0uGQO_jxniIVBlVmutus,11487
41
+ sdg_hub/core/flow/migration.py,sha256=g0Ug4ZrR_ssxJ-ESVP7ubkD0kql6aSChOuMmx-ZMn8A,7198
42
+ sdg_hub/core/flow/registry.py,sha256=T2veU05h4Q9vb_6F_NYHnNuFZE21orWsx1-iGl0aoJk,9564
43
+ sdg_hub/core/flow/validation.py,sha256=g0G7MH3bz7kcNsfRrlSi8iJZi8gqVcgODhHygVYtJVI,9185
44
+ sdg_hub/core/utils/__init__.py,sha256=y_D7HcRxw7FXShw5USQpCt-5h4VXOFFvMOMN3_oALiw,279
45
+ sdg_hub/core/utils/datautils.py,sha256=qKK2HXAqI4t-O-9RMu2DdaQVZwTnJj-W7-Hc5o1iqZw,379
46
+ sdg_hub/core/utils/error_handling.py,sha256=yku8cGj_nKCyXDsnb-mHCpgukkkAMucJ4iAUrIzqysc,5510
47
+ sdg_hub/core/utils/logger_config.py,sha256=MPYdpyNXh_pxFUOAvSCHa98LGjxjaLXoUoqWekqTG4s,422
48
+ sdg_hub/core/utils/path_resolution.py,sha256=yWof4kGNpQ5dKcrVHg0h9KfOKLZ6ROjdfsLAZsQT5rM,2000
49
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml,sha256=xgUNY793y4lcpdtuWm5Ah1CmbU2gvvPQCpZMMa6kPXU,2447
52
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml,sha256=_vF-AzjC8d6wqAle5pkQ103EW-BbAhNA0qllk3ojUZc,353
53
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml,sha256=GiIipXrjm7btghvpgFUoTZYAJRyu7yE-WEi5yDLxjY4,3032
54
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml,sha256=zwzklXup6khRkR88avgrJTcjaMcV1wnbeYaML5oPuNs,1767
55
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml,sha256=cA8igo7jMrRXaWW6k0of6KOp7YnxLtPj0fP4DbrmZNQ,3647
56
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml,sha256=fcMV7LaCFZo4D29nwhGJXqFFuZMYVLo9XYjv8zcU6zs,364
57
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml,sha256=RrWr2jaandGgLkJiBLFPPA1g6B6vmL98-qXPozqjHKQ,6286
58
+ sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml,sha256=yX8aLY8dJSDML9ZJhnj9RzPbN8tH2xfcM4Gc6xZuwqQ,2596
59
+ sdg_hub-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
60
+ sdg_hub-0.2.0.dist-info/METADATA,sha256=APjsGUk94_tQRVlncgVxkEOTSOpHY25SOMmOO1lt0P0,8464
61
+ sdg_hub-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
62
+ sdg_hub-0.2.0.dist-info/top_level.txt,sha256=TqI7d-HE1n6zkXFkU0nF3A1Ct0P0pBaqI675uFokhx4,8
63
+ sdg_hub-0.2.0.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- """Block implementations for SDG Hub.
2
-
3
- This package provides various block implementations for data generation, processing, and transformation.
4
- """
5
-
6
- # Local
7
- from .block import Block
8
- from .llmblock import LLMBlock, ConditionalLLMBlock
9
- from .openaichatblock import (
10
- OpenAIChatBlock,
11
- OpenAIAsyncChatBlock
12
- )
13
- from .utilblocks import (
14
- SamplePopulatorBlock,
15
- SelectorBlock,
16
- CombineColumnsBlock,
17
- FlattenColumnsBlock,
18
- DuplicateColumns,
19
- RenameColumns,
20
- SetToMajorityValue,
21
- FilterByValueBlock,
22
- IterBlock,
23
- )
24
- from ..registry import BlockRegistry
25
-
26
- __all__ = [
27
- "Block",
28
- "FilterByValueBlock",
29
- "IterBlock",
30
- "LLMBlock",
31
- "ConditionalLLMBlock",
32
- "SamplePopulatorBlock",
33
- "SelectorBlock",
34
- "CombineColumnsBlock",
35
- "FlattenColumnsBlock",
36
- "DuplicateColumns",
37
- "RenameColumns",
38
- "SetToMajorityValue",
39
- "BlockRegistry",
40
- "OpenAIChatBlock",
41
- "OpenAIAsyncChatBlock"
42
- ]
sdg_hub/blocks/block.py DELETED
@@ -1,96 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- """Base block implementation for the SDG Hub system.
3
-
4
- This module provides the abstract base class for all blocks in the system,
5
- including functionality for template validation and configuration management.
6
- """
7
-
8
- # Standard
9
- from abc import ABC
10
- from collections import ChainMap
11
- from typing import Any, Dict, Optional
12
-
13
- # Third Party
14
- from jinja2 import Template, UndefinedError
15
- import yaml
16
-
17
- # Local
18
- from ..registry import BlockRegistry
19
- from ..logger_config import setup_logger
20
-
21
- logger = setup_logger(__name__)
22
-
23
-
24
- @BlockRegistry.register("Block")
25
- class Block(ABC):
26
- """Base abstract class for all blocks in the system.
27
-
28
- This class provides common functionality for block validation and configuration loading.
29
- All specific block implementations should inherit from this class.
30
- """
31
-
32
- def __init__(self, block_name: str) -> None:
33
- self.block_name = block_name
34
-
35
- @staticmethod
36
- def _validate(prompt_template: Template, input_dict: Dict[str, Any]) -> bool:
37
- """Validate the input data for this block.
38
-
39
- This method validates whether all required variables in the Jinja template are provided in the input_dict.
40
-
41
- Parameters
42
- ----------
43
- prompt_template : Template
44
- The Jinja2 template object.
45
- input_dict : Dict[str, Any]
46
- A dictionary of input values to check against the template.
47
-
48
- Returns
49
- -------
50
- bool
51
- True if the input data is valid (i.e., no missing variables), False otherwise.
52
- """
53
-
54
- class Default(dict):
55
- def __missing__(self, key: str) -> None:
56
- raise KeyError(key)
57
-
58
- try:
59
- # Try rendering the template with the input_dict
60
- prompt_template.render(ChainMap(input_dict, Default()))
61
- return True
62
- except UndefinedError as e:
63
- logger.error(f"Missing key: {e}")
64
- return False
65
-
66
- def _load_config(self, config_path: str) -> Optional[Dict[str, Any]]:
67
- """Load the configuration file for this block.
68
-
69
- Parameters
70
- ----------
71
- config_path : str
72
- The path to the configuration file.
73
-
74
- Returns
75
- -------
76
- Optional[Dict[str, Any]]
77
- The loaded configuration. Returns None if file cannot be read or parsed.
78
-
79
- Raises
80
- ------
81
- FileNotFoundError
82
- If the configuration file does not exist.
83
- """
84
- try:
85
- with open(config_path, "r", encoding="utf-8") as config_file:
86
- try:
87
- return yaml.safe_load(config_file)
88
- except yaml.YAMLError as e:
89
- logger.error(f"Error parsing YAML from {config_path}: {e}")
90
- return None
91
- except FileNotFoundError:
92
- logger.error(f"Configuration file not found: {config_path}")
93
- raise
94
- except Exception as e:
95
- logger.error(f"Unexpected error reading config file {config_path}: {e}")
96
- return None