sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,393 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow registry for managing contributed flows."""
3
+
4
+ # Standard
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional
8
+ import os
9
+
10
+ # Third Party
11
+ from rich.console import Console
12
+ from rich.table import Table
13
+ import yaml
14
+
15
+ # Local
16
+ from ..utils.logger_config import setup_logger
17
+ from ..utils.yaml_utils import save_flow_yaml
18
+ from .metadata import FlowMetadata
19
+
20
+ logger = setup_logger(__name__)
21
+
22
+
23
@dataclass
class FlowRegistryEntry:
    """A single discovered flow: where it lives and what it declares.

    Attributes
    ----------
    path : str
        Location of the flow YAML file on disk.
    metadata : FlowMetadata
        Metadata built from the ``metadata`` section of that file.
    """

    # Filesystem path of the flow definition.
    path: str
    # Parsed ``metadata`` section of the flow definition.
    metadata: FlowMetadata
37
+
38
+
39
class FlowRegistry:
    """Class-level registry of discoverable flow YAML files.

    All state is stored on the class itself, so every caller shares the same
    search paths and the same set of discovered flows. A file is treated as a
    flow when it is a ``*.yaml`` file (searched recursively below each search
    path) whose top-level mapping contains both ``metadata`` and ``blocks``.
    """

    # Discovered flows, keyed by ``metadata.name``.
    _entries: dict[str, FlowRegistryEntry] = {}
    # Directories that are scanned recursively for flow YAML files.
    _search_paths: list[str] = []
    # True once the built-in ``sdg_hub/flows`` directory has been considered.
    _initialized: bool = False

    @classmethod
    def _ensure_initialized(cls) -> None:
        """Register the packaged ``flows`` directory as a search path (once).

        Failures are logged as warnings and never raised; the registry is
        marked initialized either way so the attempt is not repeated.
        """
        if cls._initialized:
            return

        try:
            # Imported lazily, inside the guard, so an import problem only
            # disables built-in flows instead of breaking the registry.
            # First Party
            import sdg_hub

            flows_dir = Path(sdg_hub.__file__).parent / "flows"

            if flows_dir.exists():
                flows_dir_str = str(flows_dir)
                if flows_dir_str not in cls._search_paths:
                    cls._search_paths.append(flows_dir_str)
                    logger.debug(
                        f"Auto-registered built-in flows directory: {flows_dir}"
                    )
            else:
                logger.debug(f"Built-in flows directory not found: {flows_dir}")

        except Exception as exc:
            logger.warning(f"Failed to auto-register built-in flows: {exc}")

        cls._initialized = True

    @classmethod
    def register_search_path(cls, path: str) -> None:
        """Add a directory to search for flows.

        Registering a new path invalidates previously discovered entries so
        that the next lookup rescans every search path, including this one.
        Registering a path that is already known is a no-op.

        Parameters
        ----------
        path : str
            Path to directory containing flow YAML files.
        """
        if path in cls._search_paths:
            return

        cls._search_paths.append(path)
        # Without this, a registry that was already populated would keep
        # serving its cached entries and never look inside the new directory.
        cls._entries.clear()
        logger.debug(f"Added flow search path: {path}")

    @classmethod
    def _discover_flows(cls, force_refresh: bool = False) -> None:
        """Scan all search paths and (re)build the registry entries.

        Parameters
        ----------
        force_refresh : bool, optional
            Rescan even when entries are already cached. An empty cache is
            always rescanned.
        """
        cls._ensure_initialized()

        if cls._entries and not force_refresh:
            return

        cls._entries.clear()

        for search_path in cls._search_paths:
            if not os.path.exists(search_path):
                logger.warning(f"Flow search path does not exist: {search_path}")
                continue

            cls._discover_flows_in_directory(search_path)

        logger.info(f"Discovered {len(cls._entries)} flows")

    @classmethod
    def _discover_flows_in_directory(cls, directory: str) -> None:
        """Register every flow YAML found recursively below ``directory``.

        Files that cannot be read, parsed, or turned into ``FlowMetadata`` are
        skipped with a debug log; discovery is best-effort by design. When the
        constructed metadata carries an id that the file's ``metadata`` section
        did not contain, the id is written back to the file via
        ``save_flow_yaml``.

        Parameters
        ----------
        directory : str
            Directory to scan.
        """
        for yaml_file in Path(directory).rglob("*.yaml"):
            try:
                with open(yaml_file, encoding="utf-8") as f:
                    flow_config = yaml.safe_load(f)

                # Empty files load as None, and non-flow YAML files may hold a
                # list or scalar at the top level; none of those are flows.
                if not isinstance(flow_config, dict):
                    continue
                if "metadata" not in flow_config or "blocks" not in flow_config:
                    continue

                metadata_dict = flow_config["metadata"]
                metadata = FlowMetadata(**metadata_dict)

                # Persist an id that was produced while building the metadata
                # so the file carries the same id on later loads.
                if metadata.id and "id" not in metadata_dict:
                    flow_config["metadata"]["id"] = metadata.id
                    save_flow_yaml(
                        yaml_file,
                        flow_config,
                        f"updated with generated id: {metadata.id}",
                    )

                flow_path = str(yaml_file)
                previous = cls._entries.get(metadata.name)
                if previous is not None and previous.path != flow_path:
                    # Entries are keyed by name, so the later file wins; make
                    # the collision visible instead of replacing silently.
                    logger.warning(
                        f"Duplicate flow name '{metadata.name}': {flow_path} "
                        f"replaces {previous.path}"
                    )

                cls._entries[metadata.name] = FlowRegistryEntry(
                    path=flow_path, metadata=metadata
                )
                logger.debug(
                    f"Registered flow: {metadata.name} (id: {metadata.id}) from {yaml_file}"
                )

            except Exception as exc:
                logger.debug(f"Skipped {yaml_file}: {exc}")

    @classmethod
    def get_flow_path(cls, flow_name_or_id: str) -> Optional[str]:
        """Get the path to a registered flow.

        For backward compatibility, this function accepts either a flow id or
        a flow name. An id match always takes precedence over a name match.

        Parameters
        ----------
        flow_name_or_id : str
            Either the flow id or flow name to find.

        Returns
        -------
        Optional[str]
            Path to the flow file, or None if not found.
        """
        cls._discover_flows()

        # Ids are preferred: check every entry for an id match first.
        for entry in cls._entries.values():
            if entry.metadata.id == flow_name_or_id:
                return entry.path

        # Entries are keyed by name, so the fallback is a direct lookup.
        entry = cls._entries.get(flow_name_or_id)
        if entry is not None:
            logger.debug(
                f"Found flow by name (deprecated): {flow_name_or_id}, use id: {entry.metadata.id} instead"
            )
            return entry.path

        return None

    @classmethod
    def get_flow_path_safe(cls, flow_name_or_id: str) -> str:
        """Get the path to a registered flow, raising if it is unknown.

        Parameters
        ----------
        flow_name_or_id : str
            Either the flow id or flow name to find.

        Returns
        -------
        str
            Path to the flow file.

        Raises
        ------
        ValueError
            If the flow is not found. The message lists the id and name of
            every registered flow, or states that none are registered.
        """
        path = cls.get_flow_path(flow_name_or_id)
        if path is not None:
            return path

        lines = [f"Flow '{flow_name_or_id}' not found."]
        available_flows = cls.list_flows()
        if available_flows:
            lines.append("Available flows:")
            lines.extend(
                f" - ID: '{flow['id']}', Name: '{flow['name']}'"
                for flow in available_flows
            )
        else:
            lines.append(
                "No flows are currently registered. Try running FlowRegistry.discover_flows() first."
            )

        raise ValueError("\n".join(lines))

    @classmethod
    def get_flow_metadata(cls, flow_name: str) -> Optional[FlowMetadata]:
        """Get metadata for a registered flow.

        Parameters
        ----------
        flow_name : str
            Name of the flow. A flow id is also accepted, for consistency
            with ``get_flow_path``; a name match takes precedence.

        Returns
        -------
        Optional[FlowMetadata]
            Flow metadata, or None if not found.
        """
        cls._discover_flows()

        entry = cls._entries.get(flow_name)
        if entry is not None:
            return entry.metadata

        for candidate in cls._entries.values():
            if candidate.metadata.id == flow_name:
                return candidate.metadata
        return None

    @classmethod
    def list_flows(cls) -> List[Dict[str, str]]:
        """List all registered flows with their IDs.

        Returns
        -------
        List[Dict[str, str]]
            One dictionary per flow with the keys ``'id'`` and ``'name'``.
        """
        cls._discover_flows()
        return [
            {"id": entry.metadata.id, "name": entry.metadata.name}
            for entry in cls._entries.values()
        ]

    @classmethod
    def search_flows(
        cls, tag: Optional[str] = None, author: Optional[str] = None
    ) -> List[Dict[str, str]]:
        """Search flows by criteria.

        Parameters
        ----------
        tag : Optional[str]
            Keep only flows whose tags contain exactly this tag.
        author : Optional[str]
            Keep only flows whose author contains this text
            (case-insensitive substring match).

        Returns
        -------
        List[Dict[str, str]]
            Matching flows, each with the keys ``'id'`` and ``'name'``.
            With no criteria, every registered flow matches.
        """
        cls._discover_flows()

        matching_flows = []
        for entry in cls._entries.values():
            metadata = entry.metadata

            # ``or`` fallbacks keep the filters from raising when a flow has
            # no tags or no author set.
            if tag and tag not in (metadata.tags or []):
                continue
            if author and author.lower() not in (metadata.author or "").lower():
                continue

            matching_flows.append({"id": metadata.id, "name": metadata.name})

        return matching_flows

    @classmethod
    def get_flows_by_category(cls) -> Dict[str, List[Dict[str, str]]]:
        """Get flows organized by their primary tag.

        Returns
        -------
        Dict[str, List[Dict[str, str]]]
            Maps each flow's first tag (or ``"uncategorized"`` when it has no
            tags) to a list of ``{'id', 'name'}`` dictionaries.
        """
        cls._discover_flows()

        categories: Dict[str, List[Dict[str, str]]] = {}
        for entry in cls._entries.values():
            metadata = entry.metadata
            category = metadata.tags[0] if metadata.tags else "uncategorized"
            categories.setdefault(category, []).append(
                {"id": metadata.id, "name": metadata.name}
            )

        return categories

    @classmethod
    def discover_flows(cls) -> None:
        """Discover and display all flows in a formatted table.

        This is the main public API for flow discovery. It scans the
        registered search paths and prints a Rich table (ID, Name, Author,
        Tags, Description) sorted by flow id, or a short hint when nothing
        was found.
        """
        cls._discover_flows()

        if not cls._entries:
            print(
                "No flows discovered. Try adding search paths with register_search_path()"
            )
            print("Note: Only flows with 'metadata' section are discoverable.")
            return

        # Collect display values, substituting placeholders for empty fields.
        flow_data = [
            {
                "name": entry.metadata.name,
                "id": entry.metadata.id,
                "author": entry.metadata.author or "Unknown",
                "tags": ", ".join(entry.metadata.tags) if entry.metadata.tags else "-",
                "description": entry.metadata.description or "No description",
            }
            for entry in cls._entries.values()
        ]

        # Sort by id for a stable display order; the fallback keeps a missing
        # id from breaking the comparison.
        flow_data.sort(key=lambda row: row["id"] or "")

        console = Console()
        table = Table(show_header=True, header_style="bold bright_magenta")

        table.add_column("ID", style="bold bright_magenta", no_wrap=True)
        table.add_column("Name", style="bold bright_cyan")
        table.add_column("Author", style="bright_green")
        table.add_column("Tags", style="yellow")
        table.add_column("Description", style="white")

        for flow in flow_data:
            table.add_row(
                flow["id"],
                flow["name"],
                flow["author"],
                flow["tags"],
                flow["description"],
            )

        console.print(table)
@@ -0,0 +1,277 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow validation utilities."""
3
+
4
+ # Standard
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ # Third Party
8
+ from datasets import Dataset
9
+
10
+ if TYPE_CHECKING:
11
+ # Local
12
+ from .base import Flow
13
+
14
+
15
class FlowValidator:
    """Validator for flow configurations and execution readiness.

    Every public method returns a list of human-readable error messages and
    never raises for invalid input; an empty list means validation passed.
    """

    def validate_yaml_structure(self, flow_config: dict[str, Any]) -> list[str]:
        """Validate the structure of a flow YAML configuration.

        Parameters
        ----------
        flow_config : Dict[str, Any]
            The loaded YAML configuration.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if valid.
        """
        errors: list[str] = []

        # An empty YAML document loads as None; report it as an error instead
        # of failing on the membership test below.
        if not isinstance(flow_config, dict):
            errors.append("Flow configuration must be a dictionary")
            return errors

        if "blocks" not in flow_config:
            errors.append("Flow configuration must contain 'blocks' section")
            return errors  # Can't continue without blocks

        blocks = flow_config["blocks"]
        if not isinstance(blocks, list):
            errors.append("'blocks' must be a list")
            return errors

        if not blocks:
            errors.append("Flow must contain at least one block")
            return errors

        for i, block_config in enumerate(blocks):
            errors.extend(self._validate_block_config(block_config, i))

        # 'metadata' and 'parameters' are optional sections.
        if "metadata" in flow_config:
            errors.extend(self._validate_metadata_config(flow_config["metadata"]))

        if "parameters" in flow_config:
            errors.extend(self._validate_parameters_config(flow_config["parameters"]))

        return errors

    def _validate_block_config(
        self, block_config: dict[str, Any], index: int
    ) -> list[str]:
        """Validate a single entry of the ``blocks`` list.

        Parameters
        ----------
        block_config : Dict[str, Any]
            The block entry; must provide ``block_type`` and ``block_config``
            (a mapping containing ``block_name``).
        index : int
            Position of the block in the flow, used to prefix error messages.

        Returns
        -------
        List[str]
            Error messages for this block. Empty if valid.
        """
        errors: list[str] = []
        prefix = f"Block {index}"

        if not isinstance(block_config, dict):
            errors.append(f"{prefix}: Block configuration must be a dictionary")
            return errors

        if "block_type" not in block_config:
            errors.append(f"{prefix}: Missing required field 'block_type'")

        if "block_config" not in block_config:
            errors.append(f"{prefix}: Missing required field 'block_config'")
        else:
            inner_config = block_config["block_config"]
            if not isinstance(inner_config, dict):
                errors.append(f"{prefix}: 'block_config' must be a dictionary")
            elif "block_name" not in inner_config:
                errors.append(f"{prefix}: 'block_config' must contain 'block_name'")

        # Optional: list of names that may be overridden at runtime.
        if "runtime_overrides" in block_config:
            overrides = block_config["runtime_overrides"]
            if not isinstance(overrides, list):
                errors.append(f"{prefix}: 'runtime_overrides' must be a list")
            elif not all(isinstance(item, str) for item in overrides):
                errors.append(
                    f"{prefix}: All 'runtime_overrides' items must be strings"
                )

        return errors

    def _validate_metadata_config(self, metadata: dict[str, Any]) -> list[str]:
        """Validate the ``metadata`` section of a flow configuration.

        Parameters
        ----------
        metadata : Dict[str, Any]
            The metadata mapping. ``name`` is required; ``id``, ``tags`` and
            the string fields checked below are optional.

        Returns
        -------
        List[str]
            Error messages for the metadata. Empty if valid.
        """
        errors: list[str] = []

        if not isinstance(metadata, dict):
            errors.append("'metadata' must be a dictionary")
            return errors

        if "name" not in metadata:
            errors.append("Metadata must contain 'name' field")
        elif not isinstance(metadata["name"], str) or not metadata["name"].strip():
            errors.append("Metadata 'name' must be a non-empty string")

        if "id" in metadata:
            flow_id = metadata["id"]
            if not isinstance(flow_id, str):
                errors.append("Metadata: 'id' must be a string")
            # Compare against lower() rather than using islower(): islower()
            # is False for strings without cased characters, which would
            # wrongly reject ids made only of digits and hyphens.
            elif flow_id and flow_id != flow_id.lower():
                errors.append("Metadata: 'id' must be lowercase")
            elif flow_id and not flow_id.replace("-", "").isalnum():
                errors.append(
                    "Metadata: 'id' must contain only alphanumeric characters and hyphens"
                )

        string_fields = (
            "description",
            "version",
            "author",
            "recommended_model",
            "license",
        )
        for field in string_fields:
            if field in metadata and not isinstance(metadata[field], str):
                errors.append(f"Metadata '{field}' must be a string")

        if "tags" in metadata:
            tags = metadata["tags"]
            if not isinstance(tags, list):
                errors.append("Metadata 'tags' must be a list")
            elif not all(isinstance(tag, str) for tag in tags):
                errors.append("All metadata 'tags' must be strings")

        return errors

    def _validate_parameters_config(self, parameters: dict[str, Any]) -> list[str]:
        """Validate the ``parameters`` section of a flow configuration.

        A parameter value may be a plain value (accepted as-is) or a mapping,
        in which case it must contain ``default`` and may contain a string
        ``description`` and a boolean ``required``.

        Parameters
        ----------
        parameters : Dict[str, Any]
            Mapping of parameter name to value or specification.

        Returns
        -------
        List[str]
            Error messages for the parameters. Empty if valid.
        """
        errors: list[str] = []

        if not isinstance(parameters, dict):
            errors.append("'parameters' must be a dictionary")
            return errors

        for param_name, param_config in parameters.items():
            if not isinstance(param_name, str):
                errors.append("Parameter names must be strings")
                continue

            # Only full specifications (mappings) have a structure to check.
            if not isinstance(param_config, dict):
                continue

            if "default" not in param_config:
                errors.append(f"Parameter '{param_name}' must have 'default' value")

            if "description" in param_config and not isinstance(
                param_config["description"], str
            ):
                errors.append(
                    f"Parameter '{param_name}' description must be a string"
                )

            if "required" in param_config and not isinstance(
                param_config["required"], bool
            ):
                errors.append(
                    f"Parameter '{param_name}' required field must be boolean"
                )

        return errors

    def validate_flow_execution(self, flow: "Flow", dataset: "Dataset") -> list[str]:
        """Validate that a flow can be executed with the given dataset.

        Walks the blocks in order, checking that each block's ``input_cols``
        are available from the dataset or from the ``output_cols`` of an
        earlier block.

        Parameters
        ----------
        flow : Flow
            The flow to validate.
        dataset : Dataset
            Dataset to validate against.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if validation passes.
        """
        errors: list[str] = []

        if not flow.blocks:
            errors.append("Flow contains no blocks")
            return errors

        if len(dataset) == 0:
            errors.append("Dataset is empty")
            return errors

        # Columns known so far; grows as blocks declare their outputs.
        current_columns = set(dataset.column_names)

        for block in flow.blocks:
            block_name = block.block_name

            input_cols = getattr(block, "input_cols", None)
            if input_cols:
                missing_cols = self._check_missing_columns(input_cols, current_columns)
                if missing_cols:
                    errors.append(
                        f"Block '{block_name}' missing input columns: {missing_cols}"
                    )

            output_cols = getattr(block, "output_cols", None)
            if output_cols:
                current_columns.update(self._extract_column_names(output_cols))

        return errors

    def _check_missing_columns(
        self, required_cols: Any, available_cols: set[str]
    ) -> list[str]:
        """Return the required columns that are not in ``available_cols``.

        ``required_cols`` may be a single column name, a list of names, or a
        dict keyed by column name; any other type yields an empty list.
        """
        if isinstance(required_cols, str):
            # A bare string is one column name, not a sequence of characters.
            required_cols = [required_cols]
        if isinstance(required_cols, (list, dict)):
            return [col for col in required_cols if col not in available_cols]
        return []

    def _extract_column_names(self, output_cols: Any) -> list[str]:
        """Return the column names declared by an output specification.

        ``output_cols`` may be a single column name, a list of names, or a
        dict keyed by column name; any other type yields an empty list.
        """
        if isinstance(output_cols, str):
            return [output_cols]
        if isinstance(output_cols, (list, dict)):
            # Iterating a dict yields its keys, i.e. the column names.
            return list(output_cols)
        return []

    def validate_block_chain(self, blocks: list[Any]) -> list[str]:
        """Validate that blocks can be chained together.

        Currently this checks that every block exposes ``block_name`` and
        that no name is used twice.

        Parameters
        ----------
        blocks : List[Any]
            List of block instances to validate.

        Returns
        -------
        List[str]
            List of validation error messages.
        """
        errors: list[str] = []

        if not blocks:
            errors.append("Block chain is empty")
            return errors

        seen_names = set()
        for i, block in enumerate(blocks):
            if not hasattr(block, "block_name"):
                errors.append(f"Block at index {i} missing 'block_name' attribute")
                continue

            name = block.block_name
            if name in seen_names:
                errors.append(f"Duplicate block name '{name}' at index {i}")
            seen_names.add(name)

        return errors
@@ -1,10 +1,13 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
+ # Local
4
+ from .flow_identifier import get_flow_identifier
5
+ from .path_resolution import resolve_path
6
+
7
+
3
8
  # This is part of the public API, and used by instructlab
4
- class GenerateException(Exception):
9
class GenerateError(Exception):
    """Error raised when the generate step fails."""
6
11
 
7
12
 
8
- from .path_resolution import resolve_path
9
-
10
- __all__ = ["GenerateException", "resolve_path"]
13
+ __all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]