sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145):
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +27 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
  28. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  29. sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
  30. sdg_hub/core/blocks/registry.py +331 -0
  31. sdg_hub/core/blocks/transform/__init__.py +23 -0
  32. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  33. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  34. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  35. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  36. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  37. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  38. sdg_hub/core/flow/__init__.py +20 -0
  39. sdg_hub/core/flow/base.py +1209 -0
  40. sdg_hub/core/flow/checkpointer.py +333 -0
  41. sdg_hub/core/flow/metadata.py +389 -0
  42. sdg_hub/core/flow/migration.py +198 -0
  43. sdg_hub/core/flow/registry.py +393 -0
  44. sdg_hub/core/flow/validation.py +277 -0
  45. sdg_hub/{utils → core/utils}/__init__.py +7 -4
  46. sdg_hub/core/utils/datautils.py +63 -0
  47. sdg_hub/core/utils/error_handling.py +208 -0
  48. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  49. sdg_hub/core/utils/flow_identifier.py +94 -0
  50. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  51. sdg_hub/core/utils/yaml_utils.py +59 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  55. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  56. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  57. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  58. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
  59. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  60. sdg_hub-0.2.1.dist-info/METADATA +221 -0
  61. sdg_hub-0.2.1.dist-info/RECORD +68 -0
  62. sdg_hub/blocks/__init__.py +0 -42
  63. sdg_hub/blocks/block.py +0 -96
  64. sdg_hub/blocks/llmblock.py +0 -375
  65. sdg_hub/blocks/openaichatblock.py +0 -556
  66. sdg_hub/blocks/utilblocks.py +0 -597
  67. sdg_hub/checkpointer.py +0 -139
  68. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  69. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  70. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  71. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  72. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  73. sdg_hub/configs/knowledge/__init__.py +0 -0
  74. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  75. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  76. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  77. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  78. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  79. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  80. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  81. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  82. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  83. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  84. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  85. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  86. sdg_hub/configs/knowledge/router.yaml +0 -12
  87. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  88. sdg_hub/configs/reasoning/__init__.py +0 -0
  89. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  90. sdg_hub/configs/skills/__init__.py +0 -0
  91. sdg_hub/configs/skills/analyzer.yaml +0 -48
  92. sdg_hub/configs/skills/annotation.yaml +0 -36
  93. sdg_hub/configs/skills/contexts.yaml +0 -28
  94. sdg_hub/configs/skills/critic.yaml +0 -60
  95. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  96. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  97. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  98. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  99. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  100. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  101. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  102. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  103. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  104. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  105. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  106. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  107. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  108. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  109. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  110. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  111. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  112. sdg_hub/configs/skills/judge.yaml +0 -53
  113. sdg_hub/configs/skills/planner.yaml +0 -67
  114. sdg_hub/configs/skills/respond.yaml +0 -8
  115. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  116. sdg_hub/configs/skills/router.yaml +0 -59
  117. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  118. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  119. sdg_hub/flow.py +0 -477
  120. sdg_hub/flow_runner.py +0 -450
  121. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  122. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  123. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  124. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  125. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  126. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  127. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  128. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  129. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  130. sdg_hub/pipeline.py +0 -121
  131. sdg_hub/prompts.py +0 -80
  132. sdg_hub/registry.py +0 -122
  133. sdg_hub/sdg.py +0 -206
  134. sdg_hub/utils/config_validation.py +0 -91
  135. sdg_hub/utils/datautils.py +0 -14
  136. sdg_hub/utils/error_handling.py +0 -94
  137. sdg_hub/utils/validation_result.py +0 -10
  138. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  139. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  140. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  141. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  142. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  143. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
  144. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
  145. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,389 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow metadata and parameter definitions."""
3
+
4
+ # Standard
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Any, Optional
8
+
9
+ # Third Party
10
+ from pydantic import BaseModel, Field, field_validator, model_validator
11
+
12
+ # Local
13
+ from ..utils.flow_identifier import get_flow_identifier
14
+
15
+
16
class ModelCompatibility(str, Enum):
    """Enumerates how well a model is known to work with a flow.

    The levels run from strongest to weakest guarantee: a model may be
    required by the flow, recommended for it, merely compatible, or
    supported only experimentally.
    """

    REQUIRED = "required"
    RECOMMENDED = "recommended"
    COMPATIBLE = "compatible"
    EXPERIMENTAL = "experimental"
23
+
24
+
25
class ModelOption(BaseModel):
    """A single model choice paired with its compatibility level.

    Attributes
    ----------
    name : str
        Model identifier (e.g., "gpt-4", "claude-3-sonnet")
    compatibility : ModelCompatibility
        Compatibility level with the flow
    """

    name: str = Field(..., description="Model identifier")
    compatibility: ModelCompatibility = Field(
        default=ModelCompatibility.COMPATIBLE,
        description="Compatibility level with the flow",
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Trim surrounding whitespace and reject blank model names."""
        stripped = v.strip()
        if not stripped:
            raise ValueError("Model name cannot be empty")
        return stripped
49
+
50
+
51
class RecommendedModels(BaseModel):
    """Recommended models for a flow, grouped by confidence.

    Attributes
    ----------
    default : str
        The default model to use
    compatible : List[str]
        List of compatible models
    experimental : List[str]
        List of experimental models
    """

    default: str = Field(..., description="Default model to use")
    compatible: list[str] = Field(default_factory=list, description="Compatible models")
    experimental: list[str] = Field(
        default_factory=list, description="Experimental models"
    )

    @field_validator("default")
    @classmethod
    def validate_default(cls, v: str) -> str:
        """Trim the default model name and reject a blank value."""
        default_name = v.strip()
        if not default_name:
            raise ValueError("Default model name cannot be empty")
        return default_name

    @field_validator("compatible", "experimental")
    @classmethod
    def validate_model_lists(cls, v: list[str]) -> list[str]:
        """Strip whitespace from each entry and drop empty names."""
        cleaned = []
        for entry in v:
            entry = entry.strip()
            if entry:
                cleaned.append(entry)
        return cleaned

    def get_all_models(self) -> list[str]:
        """Return every model, in priority order: default, compatible, experimental."""
        return [self.default, *self.compatible, *self.experimental]

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Get the best model based on availability.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        if available_models is None:
            return self.default

        # get_all_models() already yields candidates in priority order:
        # default first, then compatible, then experimental.
        for candidate in self.get_all_models():
            if candidate in available_models:
                return candidate

        return None
119
+
120
+
121
class FlowParameter(BaseModel):
    """A runtime-tunable parameter exposed by a flow.

    Attributes
    ----------
    default : Any
        Default value for the parameter.
    description : str
        Human-readable description of the parameter.
    type_hint : str
        Type hint as string (e.g., "float", "str").
    required : bool
        Whether this parameter is required at runtime.
    constraints : Dict[str, Any]
        Additional constraints for the parameter.
    """

    default: Any = Field(..., description="Default value for the parameter")
    description: str = Field(default="", description="Human-readable description")
    type_hint: str = Field(default="Any", description="Type hint as string")
    required: bool = Field(default=False, description="Whether parameter is required")
    constraints: dict[str, Any] = Field(
        default_factory=dict, description="Additional constraints for the parameter"
    )

    @model_validator(mode="after")
    def validate_required_default(self) -> "FlowParameter":
        """Reject a None default on a parameter marked as required."""
        if self.default is None and self.required:
            raise ValueError("Required parameters cannot have None as default")
        return self
152
+
153
+
154
class DatasetRequirements(BaseModel):
    """What an input dataset must provide for a flow to execute.

    Attributes
    ----------
    required_columns : List[str]
        Column names that must be present in the input dataset.
    optional_columns : List[str]
        Column names that are optional but can enhance flow performance.
    min_samples : int
        Minimum number of samples required in the dataset.
    max_samples : Optional[int]
        Maximum number of samples to process (for resource management).
    column_types : Dict[str, str]
        Expected types for specific columns.
    description : str
        Human-readable description of dataset requirements.
    """

    required_columns: list[str] = Field(
        default_factory=list, description="Column names that must be present"
    )
    optional_columns: list[str] = Field(
        default_factory=list,
        description="Optional columns that can enhance performance",
    )
    min_samples: int = Field(
        default=1, ge=1, description="Minimum number of samples required"
    )
    max_samples: Optional[int] = Field(
        default=None, gt=0, description="Maximum number of samples to process"
    )
    column_types: dict[str, str] = Field(
        default_factory=dict, description="Expected types for specific columns"
    )
    description: str = Field(default="", description="Human-readable description")

    @field_validator("required_columns", "optional_columns")
    @classmethod
    def validate_column_names(cls, v: list[str]) -> list[str]:
        """Strip whitespace from column names and drop blank entries."""
        cleaned = []
        for col in v:
            col = col.strip()
            if col:
                cleaned.append(col)
        return cleaned

    @model_validator(mode="after")
    def validate_sample_limits(self) -> "DatasetRequirements":
        """Ensure max_samples (when set) is not below min_samples."""
        if self.max_samples is None:
            return self
        if self.max_samples < self.min_samples:
            raise ValueError("max_samples must be greater than or equal to min_samples")
        return self

    def validate_dataset(
        self, dataset_columns: list[str], dataset_size: int
    ) -> list[str]:
        """Validate a dataset against these requirements.

        Parameters
        ----------
        dataset_columns : List[str]
            Column names in the dataset.
        dataset_size : int
            Number of samples in the dataset.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if valid.
        """
        errors: list[str] = []

        # Every required column must be present in the dataset.
        missing_columns = [
            col for col in self.required_columns if col not in dataset_columns
        ]
        if missing_columns:
            errors.append(f"Missing required columns: {missing_columns}")

        # The dataset must meet the minimum sample count.
        if dataset_size < self.min_samples:
            errors.append(
                f"Dataset has {dataset_size} samples, minimum required: {self.min_samples}"
            )

        return errors
237
+
238
+
239
class FlowMetadata(BaseModel):
    """Descriptive metadata for a flow, used for configuration and contributions.

    Attributes
    ----------
    id : str
        Unique identifier for the flow.
    name : str
        Human-readable name of the flow.
    description : str
        Detailed description of what the flow does.
    version : str
        Semantic version (e.g., "1.0.0").
    author : str
        Author or contributor name.
    recommended_models : Optional[RecommendedModels]
        Recommended models grouped into default, compatible, and experimental.
    tags : List[str]
        Tags for categorization and search.
    created_at : str
        Creation timestamp.
    updated_at : str
        Last update timestamp.
    license : str
        License identifier.
    min_sdg_hub_version : str
        Minimum required SDG Hub version.
    dataset_requirements : Optional[DatasetRequirements]
        Requirements for input datasets.
    estimated_cost : str
        Estimated cost tier for running the flow.
    estimated_duration : str
        Estimated duration for flow execution.
    """

    name: str = Field(..., min_length=1, description="Human-readable name")
    id: str = Field(
        default="", description="Unique identifier for the flow, generated from name"
    )
    description: str = Field(default="", description="Detailed description")
    version: str = Field(
        default="1.0.0",
        pattern=r"^\d+\.\d+\.\d+(-[a-zA-Z0-9.-]+)?$",
        description="Semantic version",
    )
    author: str = Field(default="", description="Author or contributor name")
    recommended_models: Optional[RecommendedModels] = Field(
        default=None, description="Simplified recommended models structure"
    )
    tags: list[str] = Field(
        default_factory=list, description="Tags for categorization and search"
    )
    created_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Creation timestamp",
    )
    updated_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Last update timestamp",
    )
    license: str = Field(default="Apache-2.0", description="License identifier")
    min_sdg_hub_version: str = Field(
        default="", description="Minimum required SDG Hub version"
    )
    dataset_requirements: Optional[DatasetRequirements] = Field(
        default=None, description="Requirements for input datasets"
    )
    estimated_cost: str = Field(
        default="medium",
        pattern="^(low|medium|high)$",
        description="Estimated cost tier for running the flow",
    )
    estimated_duration: str = Field(
        default="", description="Estimated duration for flow execution"
    )

    @field_validator("id")
    @classmethod
    def validate_id(cls, v: str) -> str:
        """Validate the format of an explicitly supplied id.

        Auto-generation from ``name`` is handled by the ``ensure_id``
        model validator, since a field validator cannot read other
        fields in Pydantic v2. An empty id is allowed through here.
        """
        if not v:
            return v

        # Lowercase only.
        if not v.islower():
            raise ValueError("id must be lowercase")

        # Alphanumerics and hyphens only.
        if not v.replace("-", "").isalnum():
            raise ValueError(
                "id must contain only alphanumeric characters and hyphens"
            )

        # No leading or trailing hyphen.
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("id must not start or end with a hyphen")

        return v

    @field_validator("tags")
    @classmethod
    def validate_tags(cls, v: list[str]) -> list[str]:
        """Trim and lowercase tags, dropping blank entries."""
        cleaned = []
        for tag in v:
            tag = tag.strip()
            if tag:
                cleaned.append(tag.lower())
        return cleaned

    @field_validator("recommended_models")
    @classmethod
    def validate_recommended_models(
        cls, v: Optional[RecommendedModels]
    ) -> Optional[RecommendedModels]:
        """Pass-through hook; RecommendedModels validates itself."""
        return v

    def update_timestamp(self) -> None:
        """Refresh updated_at to the current time."""
        self.updated_at = datetime.now().isoformat()

    @model_validator(mode="after")
    def ensure_id(self) -> "FlowMetadata":
        """Derive an id from the name when none was provided.

        Note: YAML persistence is handled by Flow.from_yaml() and FlowRegistry
        to maintain proper separation of concerns.
        """
        if self.name and not self.id:
            self.id = get_flow_identifier(self.name)
        return self

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Get the best recommended model based on availability.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default model.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        # recommended_models is either None or a (truthy) model instance,
        # so this is equivalent to the original `if not ...` check.
        if self.recommended_models is None:
            return None
        return self.recommended_models.get_best_model(available_models)
@@ -0,0 +1,198 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Migration utilities for backward compatibility with old flow formats."""
3
+
4
+ # Standard
5
+ from pathlib import Path
6
+ from typing import Any, Union
7
+
8
+ # Local
9
+ from ..utils.logger_config import setup_logger
10
+
11
+ logger = setup_logger(__name__)
12
+
13
+
14
class FlowMigration:
    """Utility class for migrating old flow formats to new format."""

    @staticmethod
    def is_old_format(flow_config: Union[list[dict[str, Any]], dict[str, Any]]) -> bool:
        """Detect if a flow configuration is in the old format.

        Parameters
        ----------
        flow_config : Union[List[Dict[str, Any]], Dict[str, Any]]
            The loaded YAML configuration.

        Returns
        -------
        bool
            True if the configuration is in old format, False otherwise.
        """
        # Old format is a bare list of block configs.
        if isinstance(flow_config, list):
            return True

        if isinstance(flow_config, dict):
            # New format carries top-level 'metadata' and 'blocks' keys.
            # When neither key is present, treat the dict as old format
            # only if its values look like block configurations.
            if "metadata" not in flow_config and "blocks" not in flow_config:
                return any(
                    isinstance(entry, dict) and "block_type" in entry
                    for entry in flow_config.values()
                )
            return False

        # Anything else: assume new format.
        return False

    @staticmethod
    def migrate_to_new_format(
        flow_config: list[dict[str, Any]], yaml_path: str
    ) -> tuple[dict[str, Any], dict[str, dict[str, Any]]]:
        """Migrate old format flow configuration to new format.

        Parameters
        ----------
        flow_config : List[Dict[str, Any]]
            Old format flow configuration (array of blocks).
        yaml_path : str
            Path to the original YAML file for generating metadata.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]
            Tuple of (new format flow configuration, extracted runtime_params).
        """
        logger.info(f"Migrating old flow format from: {yaml_path}")

        # Synthesize metadata from the YAML file name.
        flow_name = Path(yaml_path).stem
        metadata = FlowMigration._generate_default_metadata(flow_name)

        migrated_blocks: list[dict[str, Any]] = []
        runtime_params: dict[str, dict[str, Any]] = {}

        # Migrate each block, collecting any extracted runtime params.
        for i, block_config in enumerate(flow_config):
            try:
                migrated_block, block_runtime_params = (
                    FlowMigration._migrate_block_config(block_config)
                )
                migrated_blocks.append(migrated_block)

                # Runtime params are keyed by the block's name, when known.
                if block_runtime_params:
                    block_name = migrated_block.get("block_config", {}).get(
                        "block_name"
                    )
                    if block_name:
                        runtime_params[block_name] = block_runtime_params

            except Exception as exc:
                logger.warning(f"Failed to migrate block at index {i}: {exc}")
                # Fall back to the untouched block config.
                migrated_blocks.append(block_config)

        new_config = {"metadata": metadata, "blocks": migrated_blocks}

        logger.info(f"Successfully migrated flow with {len(migrated_blocks)} blocks")
        logger.info(f"Extracted runtime_params for {len(runtime_params)} blocks")

        return new_config, runtime_params

    @staticmethod
    def _generate_default_metadata(flow_name: str) -> dict[str, Any]:
        """Generate default metadata for migrated flows."""
        # Import here to avoid circular import
        from ..utils.flow_identifier import get_flow_identifier

        metadata: dict[str, Any] = {
            "name": flow_name,
            "description": f"Migrated flow: {flow_name}",
            "version": "1.0.0",
            "author": "SDG_Hub",
            "tags": ["migrated"],
            "recommended_models": {
                "default": "meta-llama/Llama-3.3-70B-Instruct",
                "compatible": [],
                "experimental": [],
            },
        }

        # Attach an id when one can be derived from the name.
        flow_id = get_flow_identifier(flow_name)
        if flow_id:
            metadata["id"] = flow_id
            logger.debug(f"Generated id for migrated flow: {flow_id}")

        return metadata

    @staticmethod
    def _migrate_block_config(
        block_config: dict[str, Any],
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Migrate individual block configuration from old to new format.

        Parameters
        ----------
        block_config : Dict[str, Any]
            Old format block configuration.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Any]]
            Tuple of (migrated block configuration, extracted runtime_params).
        """
        # Non-dict entries pass through untouched.
        if not isinstance(block_config, dict):
            return block_config, {}

        # Shallow copy: nested dicts (e.g. block_config section) remain shared.
        migrated_config = dict(block_config)
        runtime_params: dict[str, Any] = {}

        # Old-format gen_kwargs become runtime parameters.
        if "gen_kwargs" in migrated_config:
            runtime_params = migrated_config.pop("gen_kwargs")
            logger.debug(f"Extracted gen_kwargs as runtime_params: {runtime_params}")

        # Drop fields the new format does not support.
        for unsupported_field in ("drop_columns", "drop_duplicates", "batch_kwargs"):
            if unsupported_field in migrated_config:
                del migrated_config[unsupported_field]
                logger.debug(
                    f"Ignoring {unsupported_field} as it's not supported in new flow format"
                )

        block_type = migrated_config.get("block_type")
        block_config_section = migrated_config.get("block_config", {})

        # LLMBlock keeps its parser_kwargs inside block_config.
        if block_type == "LLMBlock" and "parser_kwargs" in block_config_section:
            parser_kwargs = block_config_section["parser_kwargs"]
            logger.debug(f"Preserving parser_kwargs for LLMBlock: {parser_kwargs}")

        # FilterByValueBlock: "operator.eq" style strings become plain "eq".
        if block_type == "FilterByValueBlock" and "operation" in block_config_section:
            operation = block_config_section["operation"]
            if isinstance(operation, str) and operation.startswith("operator."):
                block_config_section["operation"] = operation.replace("operator.", "")
                logger.debug(
                    f"Converted operation from {operation} to {block_config_section['operation']}"
                )

        return migrated_config, runtime_params