sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -74
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.3.dist-info/METADATA +0 -190
- sdg_hub-0.1.3.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
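Taken together, the file list above shows a wholesale restructure: the flat 0.1.3 modules (sdg_hub.blocks, sdg_hub.flow, sdg_hub.utils) are deleted and recreated under sdg_hub.core, with legacy blocks parked in core/blocks/deprecated_blocks. A hedged migration sketch based only on the paths above; top-level re-exports from sdg_hub/__init__.py are not visible in this diff, and the LLMChatBlock class name is assumed from its file name:

# Hypothetical 0.2.0 imports implied by the new paths above; verify against
# sdg_hub/__init__.py, whose re-exports this diff does not show.
from sdg_hub.core.flow.base import Flow                          # replaces sdg_hub/flow.py
from sdg_hub.core.flow.registry import FlowRegistry              # new in 0.2.0
from sdg_hub.core.blocks.llm.llm_chat_block import LLMChatBlock  # class name assumed
from sdg_hub.core.utils.path_resolution import resolve_path      # moved from sdg_hub/utils/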
sdg_hub/core/flow/registry.py (new file)
@@ -0,0 +1,330 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Flow registry for managing contributed flows."""
+
+# Standard
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+import os
+
+# Third Party
+import yaml
+
+# Local
+from ..utils.logger_config import setup_logger
+from .metadata import FlowMetadata
+
+logger = setup_logger(__name__)
+
+
+@dataclass
+class FlowRegistryEntry:
+    """Entry in the flow registry.
+
+    Parameters
+    ----------
+    path : str
+        Path to the flow YAML file.
+    metadata : FlowMetadata
+        Flow metadata extracted from the file.
+    """
+
+    path: str
+    metadata: FlowMetadata
+
+
+class FlowRegistry:
+    """Registry for managing contributed flows."""
+
+    _entries: dict[str, FlowRegistryEntry] = {}
+    _search_paths: list[str] = []
+    _initialized: bool = False
+
+    @classmethod
+    def _ensure_initialized(cls) -> None:
+        """Ensure the registry is initialized with built-in flows."""
+        if cls._initialized:
+            return
+
+        try:
+            # Find the sdg_hub package directory
+            # First Party
+            import sdg_hub
+
+            package_path = Path(sdg_hub.__file__).parent
+            flows_dir = package_path / "flows"
+
+            # Register built-in flows directory if it exists
+            if flows_dir.exists():
+                flows_dir_str = str(flows_dir)
+                if flows_dir_str not in cls._search_paths:
+                    cls._search_paths.append(flows_dir_str)
+                    logger.debug(
+                        f"Auto-registered built-in flows directory: {flows_dir}"
+                    )
+            else:
+                logger.debug(f"Built-in flows directory not found: {flows_dir}")
+
+        except Exception as exc:
+            logger.warning(f"Failed to auto-register built-in flows: {exc}")
+
+        cls._initialized = True
+
+    @classmethod
+    def register_search_path(cls, path: str) -> None:
+        """Add a directory to search for flows.
+
+        Parameters
+        ----------
+        path : str
+            Path to directory containing flow YAML files.
+        """
+        if path not in cls._search_paths:
+            cls._search_paths.append(path)
+            logger.debug(f"Added flow search path: {path}")
+
+    @classmethod
+    def _discover_flows(cls, force_refresh: bool = False) -> None:
+        """Discover and register flows from search paths (private method).
+
+        Parameters
+        ----------
+        force_refresh : bool, optional
+            Whether to force refresh the registry.
+        """
+        # Ensure built-in flows are registered
+        cls._ensure_initialized()
+
+        if cls._entries and not force_refresh:
+            return
+
+        cls._entries.clear()
+
+        for search_path in cls._search_paths:
+            if not os.path.exists(search_path):
+                logger.warning(f"Flow search path does not exist: {search_path}")
+                continue
+
+            cls._discover_flows_in_directory(search_path)
+
+        logger.info(f"Discovered {len(cls._entries)} flows")
+
+    @classmethod
+    def _discover_flows_in_directory(cls, directory: str) -> None:
+        """Discover flows in a specific directory."""
+        path = Path(directory)
+
+        for yaml_file in path.rglob("*.yaml"):
+            try:
+                with open(yaml_file, encoding="utf-8") as f:
+                    flow_config = yaml.safe_load(f)
+
+                # Check if this is a flow file
+                if "metadata" in flow_config and "blocks" in flow_config:
+                    metadata_dict = flow_config["metadata"]
+                    metadata = FlowMetadata(**metadata_dict)
+
+                    entry = FlowRegistryEntry(path=str(yaml_file), metadata=metadata)
+
+                    cls._entries[metadata.name] = entry
+                    logger.debug(f"Registered flow: {metadata.name} from {yaml_file}")
+
+            except Exception as exc:
+                logger.debug(f"Skipped {yaml_file}: {exc}")
+
+    @classmethod
+    def get_flow_path(cls, flow_name: str) -> Optional[str]:
+        """Get the path to a registered flow.
+
+        Parameters
+        ----------
+        flow_name : str
+            Name of the flow to find.
+
+        Returns
+        -------
+        Optional[str]
+            Path to the flow file, or None if not found.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+
+        if flow_name in cls._entries:
+            return cls._entries[flow_name].path
+        return None
+
+    @classmethod
+    def get_flow_metadata(cls, flow_name: str) -> Optional[FlowMetadata]:
+        """Get metadata for a registered flow.
+
+        Parameters
+        ----------
+        flow_name : str
+            Name of the flow.
+
+        Returns
+        -------
+        Optional[FlowMetadata]
+            Flow metadata, or None if not found.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+
+        if flow_name in cls._entries:
+            return cls._entries[flow_name].metadata
+        return None
+
+    @classmethod
+    def list_flows(cls) -> list[str]:
+        """List all registered flow names.
+
+        Returns
+        -------
+        List[str]
+            List of flow names.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        return list(cls._entries.keys())
+
+    @classmethod
+    def search_flows(
+        cls, tag: Optional[str] = None, author: Optional[str] = None
+    ) -> list[str]:
+        """Search flows by criteria.
+
+        Parameters
+        ----------
+        tag : Optional[str]
+            Tag to filter by.
+        author : Optional[str]
+            Author to filter by.
+
+        Returns
+        -------
+        List[str]
+            List of matching flow names.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+
+        matching_flows = []
+
+        for name, entry in cls._entries.items():
+            metadata = entry.metadata
+
+            # Filter by tag
+            if tag and tag not in metadata.tags:
+                continue
+
+            # Filter by author
+            if author and author.lower() not in metadata.author.lower():
+                continue
+
+            matching_flows.append(name)
+
+        return matching_flows
+
+    @classmethod
+    def get_flows_by_category(cls) -> dict[str, list[str]]:
+        """Get flows organized by their primary tag.
+
+        Returns
+        -------
+        Dict[str, List[str]]
+            Dictionary mapping tags to flow names.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+
+        categories = {}
+
+        for name, entry in cls._entries.items():
+            metadata = entry.metadata
+
+            # Use first tag as primary category, or "uncategorized"
+            category = metadata.tags[0] if metadata.tags else "uncategorized"
+
+            if category not in categories:
+                categories[category] = []
+
+            categories[category].append(name)
+
+        return categories
+
+    @classmethod
+    def discover_flows(cls, show_all_columns: bool = False) -> None:
+        """Discover and display all flows in a formatted table.
+
+        This is the main public API for flow discovery. It finds all flows
+        in registered search paths and displays them in a beautiful Rich table.
+
+        Parameters
+        ----------
+        show_all_columns : bool, optional
+            Whether to show extended table with all columns, by default False
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+
+        if not cls._entries:
+            print(
+                "No flows discovered. Try adding search paths with register_search_path()"
+            )
+            print("Note: Only flows with 'metadata' section are discoverable.")
+            return
+
+        # Prepare data with fallbacks
+        flow_data = []
+        for name, entry in cls._entries.items():
+            metadata = entry.metadata
+            flow_data.append(
+                {
+                    "name": name,
+                    "author": metadata.author or "Unknown",
+                    "tags": ", ".join(metadata.tags) if metadata.tags else "-",
+                    "description": metadata.description or "No description",
+                    "version": metadata.version,
+                    "cost": metadata.estimated_cost,
+                }
+            )
+
+        # Sort by name for consistency
+        flow_data.sort(key=lambda x: x["name"])
+
+        # Display Rich table
+        # Third Party
+        from rich.console import Console
+        from rich.table import Table
+
+        console = Console()
+        table = Table(show_header=True, header_style="bold magenta")
+
+        # Add columns
+        table.add_column("Name", style="cyan", no_wrap=True)
+        table.add_column("Author", style="green")
+
+        if show_all_columns:
+            table.add_column("Version", style="blue")
+            table.add_column("Cost", style="yellow")
+
+        table.add_column("Tags", style="dim")
+        table.add_column("Description")
+
+        # Add rows
+        for flow in flow_data:
+            if show_all_columns:
+                table.add_row(
+                    flow["name"],
+                    flow["author"],
+                    flow["version"],
+                    flow["cost"],
+                    flow["tags"],
+                    flow["description"],
+                )
+            else:
+                table.add_row(
+                    flow["name"], flow["author"], flow["tags"], flow["description"]
+                )
+
+        console.print(table)
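A minimal usage sketch of the FlowRegistry API added above; the search directory and flow name are hypothetical, everything else mirrors the methods in the hunk:

from sdg_hub.core.flow.registry import FlowRegistry

FlowRegistry.register_search_path("/path/to/custom/flows")  # hypothetical directory
FlowRegistry.discover_flows()                     # prints a Rich table of all flows
names = FlowRegistry.list_flows()                 # triggers lazy discovery
path = FlowRegistry.get_flow_path("my_qa_flow")   # hypothetical name; None if absent
tagged = FlowRegistry.search_flows(tag="qa")      # filter by tag and/or author

Note that discovery is lazy and cached: every public method calls _discover_flows(), which rescans the search paths only when the cache is empty or force_refresh is set.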
sdg_hub/core/flow/validation.py (new file)
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Flow validation utilities."""
+
+# Standard
+from typing import TYPE_CHECKING, Any
+
+# Third Party
+from datasets import Dataset
+
+if TYPE_CHECKING:
+    # Local
+    from .base import Flow
+
+
+class FlowValidator:
+    """Validator for flow configurations and execution readiness."""
+
+    def validate_yaml_structure(self, flow_config: dict[str, Any]) -> list[str]:
+        """Validate the structure of a flow YAML configuration.
+
+        Parameters
+        ----------
+        flow_config : Dict[str, Any]
+            The loaded YAML configuration.
+
+        Returns
+        -------
+        List[str]
+            List of validation error messages. Empty if valid.
+        """
+        errors = []
+
+        # Check required top-level keys
+        if "blocks" not in flow_config:
+            errors.append("Flow configuration must contain 'blocks' section")
+            return errors  # Can't continue without blocks
+
+        blocks = flow_config["blocks"]
+        if not isinstance(blocks, list):
+            errors.append("'blocks' must be a list")
+            return errors
+
+        if not blocks:
+            errors.append("Flow must contain at least one block")
+            return errors
+
+        # Validate each block configuration
+        for i, block_config in enumerate(blocks):
+            block_errors = self._validate_block_config(block_config, i)
+            errors.extend(block_errors)
+
+        # Validate metadata if present
+        if "metadata" in flow_config:
+            metadata_errors = self._validate_metadata_config(flow_config["metadata"])
+            errors.extend(metadata_errors)
+
+        # Validate parameters if present
+        if "parameters" in flow_config:
+            param_errors = self._validate_parameters_config(flow_config["parameters"])
+            errors.extend(param_errors)
+
+        return errors
+
+    def _validate_block_config(
+        self, block_config: dict[str, Any], index: int
+    ) -> list[str]:
+        """Validate a single block configuration."""
+        errors = []
+        prefix = f"Block {index}"
+
+        if not isinstance(block_config, dict):
+            errors.append(f"{prefix}: Block configuration must be a dictionary")
+            return errors
+
+        # Check required fields
+        if "block_type" not in block_config:
+            errors.append(f"{prefix}: Missing required field 'block_type'")
+
+        if "block_config" not in block_config:
+            errors.append(f"{prefix}: Missing required field 'block_config'")
+        else:
+            # Validate block_config structure
+            inner_config = block_config["block_config"]
+            if not isinstance(inner_config, dict):
+                errors.append(f"{prefix}: 'block_config' must be a dictionary")
+            elif "block_name" not in inner_config:
+                errors.append(f"{prefix}: 'block_config' must contain 'block_name'")
+
+        # Validate optional fields
+        if "runtime_overrides" in block_config:
+            overrides = block_config["runtime_overrides"]
+            if not isinstance(overrides, list):
+                errors.append(f"{prefix}: 'runtime_overrides' must be a list")
+            elif not all(isinstance(item, str) for item in overrides):
+                errors.append(
+                    f"{prefix}: All 'runtime_overrides' items must be strings"
+                )
+
+        return errors
+
+    def _validate_metadata_config(self, metadata: dict[str, Any]) -> list[str]:
+        """Validate metadata configuration."""
+        errors = []
+
+        if not isinstance(metadata, dict):
+            errors.append("'metadata' must be a dictionary")
+            return errors
+
+        # Check required name field
+        if "name" not in metadata:
+            errors.append("Metadata must contain 'name' field")
+        elif not isinstance(metadata["name"], str) or not metadata["name"].strip():
+            errors.append("Metadata 'name' must be a non-empty string")
+
+        # Validate optional fields
+        string_fields = [
+            "description",
+            "version",
+            "author",
+            "recommended_model",
+            "license",
+        ]
+        for field in string_fields:
+            if field in metadata and not isinstance(metadata[field], str):
+                errors.append(f"Metadata '{field}' must be a string")
+
+        if "tags" in metadata:
+            tags = metadata["tags"]
+            if not isinstance(tags, list):
+                errors.append("Metadata 'tags' must be a list")
+            elif not all(isinstance(tag, str) for tag in tags):
+                errors.append("All metadata 'tags' must be strings")
+
+        return errors
+
+    def _validate_parameters_config(self, parameters: dict[str, Any]) -> list[str]:
+        """Validate parameters configuration."""
+        errors = []
+
+        if not isinstance(parameters, dict):
+            errors.append("'parameters' must be a dictionary")
+            return errors
+
+        for param_name, param_config in parameters.items():
+            if not isinstance(param_name, str):
+                errors.append("Parameter names must be strings")
+                continue
+
+            if isinstance(param_config, dict):
+                # Full parameter specification
+                if "default" not in param_config:
+                    errors.append(f"Parameter '{param_name}' must have 'default' value")
+
+                # Validate optional fields
+                if "description" in param_config and not isinstance(
+                    param_config["description"], str
+                ):
+                    errors.append(
+                        f"Parameter '{param_name}' description must be a string"
+                    )
+
+                if "required" in param_config and not isinstance(
+                    param_config["required"], bool
+                ):
+                    errors.append(
+                        f"Parameter '{param_name}' required field must be boolean"
+                    )
+
+        return errors
+
+    def validate_flow_execution(self, flow: "Flow", dataset: Dataset) -> list[str]:
+        """Validate that a flow can be executed with the given dataset.
+
+        Parameters
+        ----------
+        flow : Flow
+            The flow to validate.
+        dataset : Dataset
+            Dataset to validate against.
+
+        Returns
+        -------
+        List[str]
+            List of validation error messages. Empty if validation passes.
+        """
+        errors = []
+
+        if not flow.blocks:
+            errors.append("Flow contains no blocks")
+            return errors
+
+        if len(dataset) == 0:
+            errors.append("Dataset is empty")
+            return errors
+
+        # Track available columns as we progress through blocks
+        current_columns = set(dataset.column_names)
+
+        for _i, block in enumerate(flow.blocks):
+            block_name = block.block_name
+
+            # Check input columns
+            if hasattr(block, "input_cols") and block.input_cols:
+                missing_cols = self._check_missing_columns(
+                    block.input_cols, current_columns
+                )
+                if missing_cols:
+                    errors.append(
+                        f"Block '{block_name}' missing input columns: {missing_cols}"
+                    )
+
+            # Update available columns for next block
+            if hasattr(block, "output_cols") and block.output_cols:
+                new_columns = self._extract_column_names(block.output_cols)
+                current_columns.update(new_columns)
+
+        return errors
+
+    def _check_missing_columns(
+        self, required_cols: Any, available_cols: set[str]
+    ) -> list[str]:
+        """Check which required columns are missing."""
+        if isinstance(required_cols, (list, dict)):
+            return [col for col in required_cols if col not in available_cols]
+        return []
+
+    def _extract_column_names(self, output_cols: Any) -> list[str]:
+        """Extract column names from output specification."""
+        if isinstance(output_cols, list):
+            return output_cols
+        elif isinstance(output_cols, dict):
+            return list(output_cols.keys())
+        return []
+
+    def validate_block_chain(self, blocks: list[Any]) -> list[str]:
+        """Validate that blocks can be chained together.
+
+        Parameters
+        ----------
+        blocks : List[Any]
+            List of block instances to validate.
+
+        Returns
+        -------
+        List[str]
+            List of validation error messages.
+        """
+        errors = []
+
+        if not blocks:
+            errors.append("Block chain is empty")
+            return errors
+
+        # Check that all blocks have unique names
+        block_names = []
+        for i, block in enumerate(blocks):
+            if hasattr(block, "block_name"):
+                name = block.block_name
+                if name in block_names:
+                    errors.append(f"Duplicate block name '{name}' at index {i}")
+                block_names.append(name)
+            else:
+                errors.append(f"Block at index {i} missing 'block_name' attribute")
+
+        return errors
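A minimal sketch of the FlowValidator added above, run against an in-memory config; the block_type value is only illustrative, since validate_yaml_structure checks structure, not whether the type is registered:

import yaml
from sdg_hub.core.flow.validation import FlowValidator

flow_config = yaml.safe_load("""
metadata:
  name: demo_flow
blocks:
  - block_type: LLMChatBlock
    block_config:
      block_name: chat
""")

errors = FlowValidator().validate_yaml_structure(flow_config)
print(errors)  # [] -- an empty list means the structure is valid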
sdg_hub/{utils → core/utils}/__init__.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
+# Local
+from .path_resolution import resolve_path
+
+
 # This is part of the public API, and used by instructlab
-class GenerateException(Exception):
+class GenerateError(Exception):
     """An exception raised during generate step."""
 
 
-
-
-__all__ = ["GenerateException", "resolve_path"]
+__all__ = ["GenerateError", "resolve_path"]
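The hunk above is a breaking rename in a documented public API ("used by instructlab"): 0.1.3's GenerateException becomes GenerateError in 0.2.0, and the module moves from sdg_hub/utils to sdg_hub/core/utils. A sketch of the caller-side change:

# 0.1.3: from sdg_hub.utils import GenerateException
from sdg_hub.core.utils import GenerateError  # 0.2.0 name and location

try:
    raise GenerateError("generation failed")  # hypothetical failure site
except GenerateError as err:
    print(err)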
sdg_hub/{utils → core/utils}/datautils.py
@@ -3,9 +3,7 @@ from datasets import concatenate_datasets
 
 
 def safe_concatenate_datasets(datasets: list):
-    """
-    Concatenate datasets safely, ignoring any datasets that are None or empty.
-    """
+    """Concatenate datasets safely, ignoring any datasets that are None or empty."""
     filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
 
     if not filtered_datasets:
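Finally, the datautils hunk condenses the docstring of safe_concatenate_datasets to one line; the function body continues beyond this truncated hunk. A usage sketch of the documented behavior (the all-filtered branch is cut off above, so only the happy path is shown):

from datasets import Dataset
from sdg_hub.core.utils.datautils import safe_concatenate_datasets

a = Dataset.from_dict({"text": ["x"]})
b = None                             # None entries are filtered out
c = Dataset.from_dict({"text": []})  # empty datasets are filtered out
merged = safe_concatenate_datasets([a, b, c])
print(merged.num_rows)               # 1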