sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -80
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,330 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow registry for managing contributed flows."""
3
+
4
+ # Standard
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional
8
+ import os
9
+
10
+ # Third Party
11
+ import yaml
12
+
13
+ # Local
14
+ from ..utils.logger_config import setup_logger
15
+ from .metadata import FlowMetadata
16
+
17
+ logger = setup_logger(__name__)
18
+
19
+
20
@dataclass
class FlowRegistryEntry:
    """Entry in the flow registry.

    Associates a flow's on-disk YAML definition with the metadata parsed
    from it; the registry stores one entry per discovered flow name.

    Parameters
    ----------
    path : str
        Path to the flow YAML file.
    metadata : FlowMetadata
        Flow metadata extracted from the file.
    """

    # Filesystem location of the flow's YAML definition.
    path: str
    # Parsed ``metadata`` section of that file.
    metadata: FlowMetadata
34
+
35
+
36
class FlowRegistry:
    """Registry for managing contributed flows.

    Flows are discovered lazily from a set of search directories; the
    built-in ``sdg_hub/flows`` directory is registered automatically on
    first use.  All state is class-level, so the registry behaves as a
    process-wide singleton.
    """

    # Mapping of flow name -> registry entry for every discovered flow.
    _entries: dict[str, FlowRegistryEntry] = {}
    # Directories scanned (recursively) for flow YAML files.
    _search_paths: list[str] = []
    # Whether the built-in flows directory has been auto-registered.
    _initialized: bool = False

    @classmethod
    def _ensure_initialized(cls) -> None:
        """Ensure the registry is initialized with built-in flows."""
        if cls._initialized:
            return

        try:
            # Locate the installed sdg_hub package so its bundled flows
            # directory can serve as a default search path.
            # First Party
            import sdg_hub

            package_path = Path(sdg_hub.__file__).parent
            flows_dir = package_path / "flows"

            # Register built-in flows directory if it exists
            if flows_dir.exists():
                flows_dir_str = str(flows_dir)
                if flows_dir_str not in cls._search_paths:
                    cls._search_paths.append(flows_dir_str)
                    logger.debug(
                        f"Auto-registered built-in flows directory: {flows_dir}"
                    )
            else:
                logger.debug(f"Built-in flows directory not found: {flows_dir}")

        except Exception as exc:
            # Best effort: a broken install must not block explicitly
            # registered search paths from working.
            logger.warning(f"Failed to auto-register built-in flows: {exc}")

        cls._initialized = True

    @classmethod
    def register_search_path(cls, path: str) -> None:
        """Add a directory to search for flows.

        Parameters
        ----------
        path : str
            Path to directory containing flow YAML files.
        """
        if path not in cls._search_paths:
            cls._search_paths.append(path)
            logger.debug(f"Added flow search path: {path}")

    @classmethod
    def _discover_flows(cls, force_refresh: bool = False) -> None:
        """Discover and register flows from search paths (private method).

        Results are cached: once any flows have been discovered, repeated
        calls are no-ops unless ``force_refresh`` is True.

        Parameters
        ----------
        force_refresh : bool, optional
            Whether to force refresh the registry.
        """
        # Ensure built-in flows are registered
        cls._ensure_initialized()

        if cls._entries and not force_refresh:
            return

        cls._entries.clear()

        for search_path in cls._search_paths:
            if not os.path.exists(search_path):
                logger.warning(f"Flow search path does not exist: {search_path}")
                continue

            cls._discover_flows_in_directory(search_path)

        logger.info(f"Discovered {len(cls._entries)} flows")

    @classmethod
    def _discover_flows_in_directory(cls, directory: str) -> None:
        """Recursively scan a directory and register every flow file found.

        A YAML file is treated as a flow definition only when it parses to
        a mapping containing both ``metadata`` and ``blocks`` sections;
        anything else (prompt configs, empty files) is skipped silently.
        """
        path = Path(directory)

        # Accept both common YAML extensions.
        for pattern in ("*.yaml", "*.yml"):
            for yaml_file in path.rglob(pattern):
                try:
                    with open(yaml_file, encoding="utf-8") as f:
                        flow_config = yaml.safe_load(f)

                    # safe_load returns None for empty files and scalars for
                    # non-mapping documents; guard before membership tests so
                    # such files are skipped instead of raising TypeError.
                    if (
                        isinstance(flow_config, dict)
                        and "metadata" in flow_config
                        and "blocks" in flow_config
                    ):
                        metadata = FlowMetadata(**flow_config["metadata"])

                        entry = FlowRegistryEntry(
                            path=str(yaml_file), metadata=metadata
                        )

                        # Later files win silently on duplicate flow names.
                        cls._entries[metadata.name] = entry
                        logger.debug(
                            f"Registered flow: {metadata.name} from {yaml_file}"
                        )

                except Exception as exc:
                    # Unparseable or invalid files are not fatal to discovery.
                    logger.debug(f"Skipped {yaml_file}: {exc}")

    @classmethod
    def get_flow_path(cls, flow_name: str) -> Optional[str]:
        """Get the path to a registered flow.

        Parameters
        ----------
        flow_name : str
            Name of the flow to find.

        Returns
        -------
        Optional[str]
            Path to the flow file, or None if not found.
        """
        cls._ensure_initialized()
        cls._discover_flows()

        entry = cls._entries.get(flow_name)
        return entry.path if entry is not None else None

    @classmethod
    def get_flow_metadata(cls, flow_name: str) -> Optional[FlowMetadata]:
        """Get metadata for a registered flow.

        Parameters
        ----------
        flow_name : str
            Name of the flow.

        Returns
        -------
        Optional[FlowMetadata]
            Flow metadata, or None if not found.
        """
        cls._ensure_initialized()
        cls._discover_flows()

        entry = cls._entries.get(flow_name)
        return entry.metadata if entry is not None else None

    @classmethod
    def list_flows(cls) -> list[str]:
        """List all registered flow names.

        Returns
        -------
        List[str]
            List of flow names.
        """
        cls._ensure_initialized()
        cls._discover_flows()
        return list(cls._entries.keys())

    @classmethod
    def search_flows(
        cls, tag: Optional[str] = None, author: Optional[str] = None
    ) -> list[str]:
        """Search flows by criteria.

        Parameters
        ----------
        tag : Optional[str]
            Tag to filter by (exact match against the flow's tag list).
        author : Optional[str]
            Author to filter by (case-insensitive substring match).

        Returns
        -------
        List[str]
            List of matching flow names.
        """
        cls._ensure_initialized()
        cls._discover_flows()

        matching_flows = []

        for name, entry in cls._entries.items():
            metadata = entry.metadata

            # Filter by tag
            if tag and tag not in metadata.tags:
                continue

            # Filter by author
            if author and author.lower() not in metadata.author.lower():
                continue

            matching_flows.append(name)

        return matching_flows

    @classmethod
    def get_flows_by_category(cls) -> dict[str, list[str]]:
        """Get flows organized by their primary tag.

        Returns
        -------
        Dict[str, List[str]]
            Dictionary mapping tags to flow names.
        """
        cls._ensure_initialized()
        cls._discover_flows()

        categories: dict[str, list[str]] = {}

        for name, entry in cls._entries.items():
            tags = entry.metadata.tags
            # Use first tag as primary category, or "uncategorized".
            category = tags[0] if tags else "uncategorized"
            categories.setdefault(category, []).append(name)

        return categories

    @classmethod
    def discover_flows(cls, show_all_columns: bool = False) -> None:
        """Discover and display all flows in a formatted table.

        This is the main public API for flow discovery. It finds all flows
        in registered search paths and displays them in a Rich table.

        Parameters
        ----------
        show_all_columns : bool, optional
            Whether to show extended table with all columns, by default False
        """
        cls._ensure_initialized()
        cls._discover_flows()

        if not cls._entries:
            print(
                "No flows discovered. Try adding search paths with register_search_path()"
            )
            print("Note: Only flows with 'metadata' section are discoverable.")
            return

        # Prepare data with fallbacks
        flow_data = []
        for name, entry in cls._entries.items():
            metadata = entry.metadata
            flow_data.append(
                {
                    "name": name,
                    "author": metadata.author or "Unknown",
                    "tags": ", ".join(metadata.tags) if metadata.tags else "-",
                    "description": metadata.description or "No description",
                    # Rich's Table.add_row accepts only strings/renderables,
                    # so coerce fields whose type we cannot guarantee here
                    # (version/estimated_cost may be numeric or None --
                    # TODO confirm against FlowMetadata).
                    "version": str(metadata.version),
                    "cost": str(metadata.estimated_cost),
                }
            )

        # Sort by name for consistency
        flow_data.sort(key=lambda row: row["name"])

        # Imported lazily so rich stays a display-only dependency.
        # Third Party
        from rich.console import Console
        from rich.table import Table

        console = Console()
        table = Table(show_header=True, header_style="bold magenta")

        # Add columns
        table.add_column("Name", style="cyan", no_wrap=True)
        table.add_column("Author", style="green")

        if show_all_columns:
            table.add_column("Version", style="blue")
            table.add_column("Cost", style="yellow")

        table.add_column("Tags", style="dim")
        table.add_column("Description")

        # Add rows
        for flow in flow_data:
            if show_all_columns:
                table.add_row(
                    flow["name"],
                    flow["author"],
                    flow["version"],
                    flow["cost"],
                    flow["tags"],
                    flow["description"],
                )
            else:
                table.add_row(
                    flow["name"], flow["author"], flow["tags"], flow["description"]
                )

        console.print(table)
@@ -0,0 +1,265 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow validation utilities."""
3
+
4
+ # Standard
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ # Third Party
8
+ from datasets import Dataset
9
+
10
+ if TYPE_CHECKING:
11
+ # Local
12
+ from .base import Flow
13
+
14
+
15
class FlowValidator:
    """Validator for flow configurations and execution readiness.

    Stateless: every method takes its inputs explicitly and returns a list
    of human-readable error messages (empty list means valid).
    """

    def validate_yaml_structure(self, flow_config: dict[str, Any]) -> list[str]:
        """Validate the structure of a flow YAML configuration.

        Parameters
        ----------
        flow_config : Dict[str, Any]
            The loaded YAML configuration.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if valid.
        """
        errors = []

        # Check required top-level keys
        if "blocks" not in flow_config:
            errors.append("Flow configuration must contain 'blocks' section")
            return errors  # Can't continue without blocks

        blocks = flow_config["blocks"]
        if not isinstance(blocks, list):
            errors.append("'blocks' must be a list")
            return errors

        if not blocks:
            errors.append("Flow must contain at least one block")
            return errors

        # Validate each block configuration
        for i, block_config in enumerate(blocks):
            errors.extend(self._validate_block_config(block_config, i))

        # Validate metadata if present
        if "metadata" in flow_config:
            errors.extend(self._validate_metadata_config(flow_config["metadata"]))

        # Validate parameters if present
        if "parameters" in flow_config:
            errors.extend(self._validate_parameters_config(flow_config["parameters"]))

        return errors

    def _validate_block_config(
        self, block_config: dict[str, Any], index: int
    ) -> list[str]:
        """Validate a single block configuration."""
        errors = []
        prefix = f"Block {index}"

        if not isinstance(block_config, dict):
            errors.append(f"{prefix}: Block configuration must be a dictionary")
            return errors

        # Check required fields
        if "block_type" not in block_config:
            errors.append(f"{prefix}: Missing required field 'block_type'")

        if "block_config" not in block_config:
            errors.append(f"{prefix}: Missing required field 'block_config'")
        else:
            # Validate block_config structure
            inner_config = block_config["block_config"]
            if not isinstance(inner_config, dict):
                errors.append(f"{prefix}: 'block_config' must be a dictionary")
            elif "block_name" not in inner_config:
                errors.append(f"{prefix}: 'block_config' must contain 'block_name'")

        # Validate optional fields
        if "runtime_overrides" in block_config:
            overrides = block_config["runtime_overrides"]
            if not isinstance(overrides, list):
                errors.append(f"{prefix}: 'runtime_overrides' must be a list")
            elif not all(isinstance(item, str) for item in overrides):
                errors.append(
                    f"{prefix}: All 'runtime_overrides' items must be strings"
                )

        return errors

    def _validate_metadata_config(self, metadata: dict[str, Any]) -> list[str]:
        """Validate metadata configuration."""
        errors = []

        if not isinstance(metadata, dict):
            errors.append("'metadata' must be a dictionary")
            return errors

        # Check required name field
        if "name" not in metadata:
            errors.append("Metadata must contain 'name' field")
        elif not isinstance(metadata["name"], str) or not metadata["name"].strip():
            errors.append("Metadata 'name' must be a non-empty string")

        # Validate optional string-typed fields
        string_fields = [
            "description",
            "version",
            "author",
            "recommended_model",
            "license",
        ]
        for field in string_fields:
            if field in metadata and not isinstance(metadata[field], str):
                errors.append(f"Metadata '{field}' must be a string")

        if "tags" in metadata:
            tags = metadata["tags"]
            if not isinstance(tags, list):
                errors.append("Metadata 'tags' must be a list")
            elif not all(isinstance(tag, str) for tag in tags):
                errors.append("All metadata 'tags' must be strings")

        return errors

    def _validate_parameters_config(self, parameters: dict[str, Any]) -> list[str]:
        """Validate parameters configuration."""
        errors = []

        if not isinstance(parameters, dict):
            errors.append("'parameters' must be a dictionary")
            return errors

        for param_name, param_config in parameters.items():
            if not isinstance(param_name, str):
                errors.append("Parameter names must be strings")
                continue

            if isinstance(param_config, dict):
                # Full parameter specification
                if "default" not in param_config:
                    errors.append(f"Parameter '{param_name}' must have 'default' value")

                # Validate optional fields
                if "description" in param_config and not isinstance(
                    param_config["description"], str
                ):
                    errors.append(
                        f"Parameter '{param_name}' description must be a string"
                    )

                if "required" in param_config and not isinstance(
                    param_config["required"], bool
                ):
                    errors.append(
                        f"Parameter '{param_name}' required field must be boolean"
                    )

        return errors

    def validate_flow_execution(self, flow: "Flow", dataset: "Dataset") -> list[str]:
        """Validate that a flow can be executed with the given dataset.

        Walks the block chain, tracking which columns should exist after
        each block, and reports blocks whose declared inputs are missing.

        Parameters
        ----------
        flow : Flow
            The flow to validate.
        dataset : Dataset
            Dataset to validate against.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if validation passes.
        """
        errors = []

        if not flow.blocks:
            errors.append("Flow contains no blocks")
            return errors

        if len(dataset) == 0:
            errors.append("Dataset is empty")
            return errors

        # Track available columns as we progress through blocks
        current_columns = set(dataset.column_names)

        for block in flow.blocks:
            block_name = block.block_name

            # Check input columns
            if hasattr(block, "input_cols") and block.input_cols:
                missing_cols = self._check_missing_columns(
                    block.input_cols, current_columns
                )
                if missing_cols:
                    errors.append(
                        f"Block '{block_name}' missing input columns: {missing_cols}"
                    )

            # Update available columns for next block
            if hasattr(block, "output_cols") and block.output_cols:
                current_columns.update(self._extract_column_names(block.output_cols))

        return errors

    def _check_missing_columns(
        self, required_cols: Any, available_cols: set[str]
    ) -> list[str]:
        """Check which required columns are missing.

        Only list/dict specifications are checked (a dict is iterated by
        key); any other shape yields no errors.
        """
        if isinstance(required_cols, (list, dict)):
            return [col for col in required_cols if col not in available_cols]
        return []

    def _extract_column_names(self, output_cols: Any) -> list[str]:
        """Extract column names from an output specification (list or dict keys)."""
        if isinstance(output_cols, list):
            return output_cols
        elif isinstance(output_cols, dict):
            return list(output_cols.keys())
        return []

    def validate_block_chain(self, blocks: list[Any]) -> list[str]:
        """Validate that blocks can be chained together.

        Parameters
        ----------
        blocks : List[Any]
            List of block instances to validate.

        Returns
        -------
        List[str]
            List of validation error messages.
        """
        errors = []

        if not blocks:
            errors.append("Block chain is empty")
            return errors

        # Check that all blocks have unique names; a set keeps the duplicate
        # scan O(n) while reporting each repeated occurrence, as before.
        seen_names: set[str] = set()
        for i, block in enumerate(blocks):
            if hasattr(block, "block_name"):
                name = block.block_name
                if name in seen_names:
                    errors.append(f"Duplicate block name '{name}' at index {i}")
                seen_names.add(name)
            else:
                errors.append(f"Block at index {i} missing 'block_name' attribute")

        return errors
# SPDX-License-Identifier: Apache-2.0
"""Public utility exports for sdg_hub.core.utils."""

# Local
from .path_resolution import resolve_path


# This is part of the public API, and used by instructlab
class GenerateError(Exception):
    """An exception raised during generate step."""


# Backward-compatible alias: this exception was published as
# `GenerateException` in sdg_hub <= 0.1.4 and external consumers
# (e.g. instructlab) may still import or catch it under that name.
GenerateException = GenerateError

__all__ = ["GenerateError", "GenerateException", "resolve_path"]
@@ -3,9 +3,7 @@ from datasets import concatenate_datasets
3
3
 
4
4
 
5
5
  def safe_concatenate_datasets(datasets: list):
6
- """
7
- Concatenate datasets safely, ignoring any datasets that are None or empty.
8
- """
6
+ """Concatenate datasets safely, ignoring any datasets that are None or empty."""
9
7
  filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0]
10
8
 
11
9
  if not filtered_datasets: