sdg-hub 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -80
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.4.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.4.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,344 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Flow metadata and parameter definitions."""
3
+
4
+ # Standard
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Any, Optional
8
+
9
+ # Third Party
10
+ from pydantic import BaseModel, Field, field_validator, model_validator
11
+
12
+
13
class ModelCompatibility(str, Enum):
    """Enumeration of how well a model is known to work with a flow.

    Members are ordered from the strongest guarantee (``REQUIRED``) to the
    weakest (``EXPERIMENTAL``); the string values are used directly in
    serialized flow metadata.
    """

    REQUIRED = "required"
    RECOMMENDED = "recommended"
    COMPATIBLE = "compatible"
    EXPERIMENTAL = "experimental"
20
+
21
+
22
class ModelOption(BaseModel):
    """A single model choice paired with its compatibility level.

    Attributes
    ----------
    name : str
        Model identifier (e.g., "gpt-4", "claude-3-sonnet")
    compatibility : ModelCompatibility
        Compatibility level with the flow
    """

    name: str = Field(..., description="Model identifier")
    compatibility: ModelCompatibility = Field(
        default=ModelCompatibility.COMPATIBLE,
        description="Compatibility level with the flow",
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject blank model names."""
        cleaned = v.strip()
        if not cleaned:
            raise ValueError("Model name cannot be empty")
        return cleaned
46
+
47
+
48
class RecommendedModels(BaseModel):
    """Recommended model sets for a flow: one default plus fallback tiers.

    Attributes
    ----------
    default : str
        The default model to use
    compatible : List[str]
        List of compatible models
    experimental : List[str]
        List of experimental models
    """

    default: str = Field(..., description="Default model to use")
    compatible: list[str] = Field(default_factory=list, description="Compatible models")
    experimental: list[str] = Field(
        default_factory=list, description="Experimental models"
    )

    @field_validator("default")
    @classmethod
    def validate_default(cls, v: str) -> str:
        """Strip whitespace from the default model name; blank is an error."""
        stripped = v.strip()
        if not stripped:
            raise ValueError("Default model name cannot be empty")
        return stripped

    @field_validator("compatible", "experimental")
    @classmethod
    def validate_model_lists(cls, v: list[str]) -> list[str]:
        """Drop blank entries and strip whitespace from each model name."""
        cleaned: list[str] = []
        for name in v:
            name = name.strip()
            if name:
                cleaned.append(name)
        return cleaned

    def get_all_models(self) -> list[str]:
        """Return every known model: default first, then compatible, then experimental."""
        return [self.default, *self.compatible, *self.experimental]

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Get the best model based on availability.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        if available_models is None:
            return self.default

        # Candidates in priority order: default > compatible > experimental.
        for candidate in [self.default, *self.compatible, *self.experimental]:
            if candidate in available_models:
                return candidate

        return None
116
+
117
+
118
class FlowParameter(BaseModel):
    """A runtime-tunable parameter exposed by a flow.

    Attributes
    ----------
    default : Any
        Default value for the parameter.
    description : str
        Human-readable description of the parameter.
    type_hint : str
        Type hint as string (e.g., "float", "str").
    required : bool
        Whether this parameter is required at runtime.
    constraints : Dict[str, Any]
        Additional constraints for the parameter.
    """

    default: Any = Field(..., description="Default value for the parameter")
    description: str = Field(default="", description="Human-readable description")
    type_hint: str = Field(default="Any", description="Type hint as string")
    required: bool = Field(default=False, description="Whether parameter is required")
    constraints: dict[str, Any] = Field(
        default_factory=dict, description="Additional constraints for the parameter"
    )

    @model_validator(mode="after")
    def validate_required_default(self) -> "FlowParameter":
        """Reject a required parameter whose default is None."""
        # Guard clause: nothing to check for optional parameters.
        if not self.required:
            return self
        if self.default is None:
            raise ValueError("Required parameters cannot have None as default")
        return self
149
+
150
+
151
class DatasetRequirements(BaseModel):
    """Input-dataset requirements a flow expects before execution.

    Attributes
    ----------
    required_columns : List[str]
        Column names that must be present in the input dataset.
    optional_columns : List[str]
        Column names that are optional but can enhance flow performance.
    min_samples : int
        Minimum number of samples required in the dataset.
    max_samples : Optional[int]
        Maximum number of samples to process (for resource management).
    column_types : Dict[str, str]
        Expected types for specific columns.
    description : str
        Human-readable description of dataset requirements.
    """

    required_columns: list[str] = Field(
        default_factory=list, description="Column names that must be present"
    )
    optional_columns: list[str] = Field(
        default_factory=list,
        description="Optional columns that can enhance performance",
    )
    min_samples: int = Field(
        default=1, ge=1, description="Minimum number of samples required"
    )
    max_samples: Optional[int] = Field(
        default=None, gt=0, description="Maximum number of samples to process"
    )
    column_types: dict[str, str] = Field(
        default_factory=dict, description="Expected types for specific columns"
    )
    description: str = Field(default="", description="Human-readable description")

    @field_validator("required_columns", "optional_columns")
    @classmethod
    def validate_column_names(cls, v: list[str]) -> list[str]:
        """Strip whitespace from column names and drop blank entries."""
        cleaned: list[str] = []
        for col in v:
            col = col.strip()
            if col:
                cleaned.append(col)
        return cleaned

    @model_validator(mode="after")
    def validate_sample_limits(self) -> "DatasetRequirements":
        """Ensure max_samples (when set) is not below min_samples."""
        if self.max_samples is None:
            return self
        if self.max_samples < self.min_samples:
            raise ValueError("max_samples must be greater than or equal to min_samples")
        return self

    def validate_dataset(
        self, dataset_columns: list[str], dataset_size: int
    ) -> list[str]:
        """Validate a dataset against these requirements.

        Parameters
        ----------
        dataset_columns : List[str]
            Column names in the dataset.
        dataset_size : int
            Number of samples in the dataset.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if valid.
        """
        problems: list[str] = []

        # Required columns must all be present in the dataset.
        present = set(dataset_columns)
        missing_columns = [
            col for col in self.required_columns if col not in present
        ]
        if missing_columns:
            problems.append(f"Missing required columns: {missing_columns}")

        # Enforce the minimum-sample floor.
        if dataset_size < self.min_samples:
            problems.append(
                f"Dataset has {dataset_size} samples, minimum required: {self.min_samples}"
            )

        return problems
234
+
235
+
236
class FlowMetadata(BaseModel):
    """Metadata describing a flow for configuration and contributions.

    Attributes
    ----------
    name : str
        Human-readable name of the flow.
    description : str
        Detailed description of what the flow does.
    version : str
        Semantic version (e.g., "1.0.0").
    author : str
        Author or contributor name.
    recommended_models : Optional[RecommendedModels]
        Simplified recommended models structure with default, compatible, and experimental lists.
    tags : List[str]
        Tags for categorization and search.
    created_at : str
        Creation timestamp.
    updated_at : str
        Last update timestamp.
    license : str
        License identifier.
    min_sdg_hub_version : str
        Minimum required SDG Hub version.
    dataset_requirements : Optional[DatasetRequirements]
        Requirements for input datasets.
    estimated_cost : str
        Estimated cost tier for running the flow.
    estimated_duration : str
        Estimated duration for flow execution.
    """

    name: str = Field(..., min_length=1, description="Human-readable name")
    description: str = Field(default="", description="Detailed description")
    version: str = Field(
        default="1.0.0",
        pattern=r"^\d+\.\d+\.\d+(-[a-zA-Z0-9.-]+)?$",
        description="Semantic version",
    )
    author: str = Field(default="", description="Author or contributor name")
    recommended_models: Optional[RecommendedModels] = Field(
        default=None, description="Simplified recommended models structure"
    )
    tags: list[str] = Field(
        default_factory=list, description="Tags for categorization and search"
    )
    # NOTE(review): timestamps are naive local time via datetime.now();
    # consider datetime.now(timezone.utc) — confirm downstream consumers first.
    created_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Creation timestamp",
    )
    updated_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Last update timestamp",
    )
    license: str = Field(default="Apache-2.0", description="License identifier")
    min_sdg_hub_version: str = Field(
        default="", description="Minimum required SDG Hub version"
    )
    dataset_requirements: Optional[DatasetRequirements] = Field(
        default=None, description="Requirements for input datasets"
    )
    estimated_cost: str = Field(
        default="medium",
        pattern="^(low|medium|high)$",
        description="Estimated cost tier for running the flow",
    )
    estimated_duration: str = Field(
        default="", description="Estimated duration for flow execution"
    )

    @field_validator("tags")
    @classmethod
    def validate_tags(cls, v: list[str]) -> list[str]:
        """Normalize tags: strip whitespace, lowercase, drop blanks."""
        normalized: list[str] = []
        for tag in v:
            tag = tag.strip()
            if tag:
                normalized.append(tag.lower())
        return normalized

    @field_validator("recommended_models")
    @classmethod
    def validate_recommended_models(
        cls, v: Optional[RecommendedModels]
    ) -> Optional[RecommendedModels]:
        """Pass-through: RecommendedModels validates itself internally."""
        return v

    def update_timestamp(self) -> None:
        """Refresh updated_at to the current time (ISO-8601 string)."""
        self.updated_at = datetime.now().isoformat()

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Get the best recommended model based on availability.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default model.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        if not self.recommended_models:
            return None
        return self.recommended_models.get_best_model(available_models)
@@ -0,0 +1,187 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Migration utilities for backward compatibility with old flow formats."""
3
+
4
+ # Standard
5
+ from pathlib import Path
6
+ from typing import Any, Union
7
+
8
+ # Local
9
+ from ..utils.logger_config import setup_logger
10
+
11
+ logger = setup_logger(__name__)
12
+
13
+
14
class FlowMigration:
    """Utility class for migrating old flow formats to new format."""

    @staticmethod
    def is_old_format(flow_config: Union[list[dict[str, Any]], dict[str, Any]]) -> bool:
        """Detect if a flow configuration is in the old format.

        Parameters
        ----------
        flow_config : Union[List[Dict[str, Any]], Dict[str, Any]]
            The loaded YAML configuration.

        Returns
        -------
        bool
            True if the configuration is in old format, False otherwise.
        """
        # Old format: Direct array of blocks
        # New format: Dictionary with 'metadata' and 'blocks' keys
        if isinstance(flow_config, list):
            return True

        if isinstance(flow_config, dict):
            has_metadata = "metadata" in flow_config
            has_blocks = "blocks" in flow_config

            # Both keys present -> definitely new format.
            if has_metadata and has_blocks:
                return False

            # Neither key present: treat as old format only if the values
            # look like old-style block configs (dicts carrying "block_type").
            if not has_metadata and not has_blocks:
                for value in flow_config.values():
                    if isinstance(value, dict) and "block_type" in value:
                        return True
                return False

        # Ambiguous (e.g. dict with only one key, or unexpected type):
        # assume new format.
        return False

    @staticmethod
    def migrate_to_new_format(
        flow_config: list[dict[str, Any]], yaml_path: str
    ) -> tuple[dict[str, Any], dict[str, dict[str, Any]]]:
        """Migrate old format flow configuration to new format.

        Parameters
        ----------
        flow_config : List[Dict[str, Any]]
            Old format flow configuration (array of blocks).
        yaml_path : str
            Path to the original YAML file for generating metadata.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]
            Tuple of (new format flow configuration, extracted runtime_params).
        """
        logger.info(f"Migrating old flow format from: {yaml_path}")

        # Flow name defaults to the YAML filename (without extension).
        flow_name = Path(yaml_path).stem
        metadata = FlowMigration._generate_default_metadata(flow_name)

        migrated_blocks = []
        runtime_params: dict[str, dict[str, Any]] = {}

        for i, block_config in enumerate(flow_config):
            try:
                migrated_block, block_runtime_params = (
                    FlowMigration._migrate_block_config(block_config)
                )
                migrated_blocks.append(migrated_block)

                # Runtime params are keyed by block_name; blocks without a
                # name silently drop their extracted params.
                if block_runtime_params:
                    block_name = migrated_block.get("block_config", {}).get(
                        "block_name"
                    )
                    if block_name:
                        runtime_params[block_name] = block_runtime_params

            except Exception as exc:
                # Best-effort migration: keep the original block as fallback.
                logger.warning(f"Failed to migrate block at index {i}: {exc}")
                migrated_blocks.append(block_config)

        new_config = {"metadata": metadata, "blocks": migrated_blocks}

        logger.info(f"Successfully migrated flow with {len(migrated_blocks)} blocks")
        logger.info(f"Extracted runtime_params for {len(runtime_params)} blocks")

        return new_config, runtime_params

    @staticmethod
    def _generate_default_metadata(flow_name: str) -> dict[str, Any]:
        """Generate default metadata for migrated flows."""
        return {
            "name": flow_name,
            "description": f"Migrated flow: {flow_name}",
            "version": "1.0.0",
            "author": "SDG_Hub",
            "tags": ["migrated"],
            "recommended_models": {
                "default": "meta-llama/Llama-3.3-70B-Instruct",
                "compatible": [],
                "experimental": [],
            },
        }

    @staticmethod
    def _migrate_block_config(
        block_config: dict[str, Any],
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Migrate individual block configuration from old to new format.

        The caller's ``block_config`` is never mutated: the top level is
        shallow-copied, and any nested section is copied before being changed.

        Parameters
        ----------
        block_config : Dict[str, Any]
            Old format block configuration.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Any]]
            Tuple of (migrated block configuration, extracted runtime_params).
        """
        if not isinstance(block_config, dict):
            return block_config, {}

        migrated_config = block_config.copy()
        runtime_params: dict[str, Any] = {}

        # Extract gen_kwargs as runtime_params (pop affects only the copy).
        if "gen_kwargs" in migrated_config:
            runtime_params = migrated_config.pop("gen_kwargs")
            logger.debug(f"Extracted gen_kwargs as runtime_params: {runtime_params}")

        # Remove fields the new flow format does not support.
        for unsupported_field in ["drop_columns", "drop_duplicates", "batch_kwargs"]:
            if unsupported_field in migrated_config:
                migrated_config.pop(unsupported_field)
                logger.debug(
                    f"Ignoring {unsupported_field} as it's not supported in new flow format"
                )

        # parser_kwargs for LLMBlock is kept in block_config unchanged.
        if migrated_config.get("block_type") == "LLMBlock":
            block_config_section = migrated_config.get("block_config", {})
            if "parser_kwargs" in block_config_section:
                parser_kwargs = block_config_section["parser_kwargs"]
                logger.debug(f"Preserving parser_kwargs for LLMBlock: {parser_kwargs}")

        # Convert "operator.eq"-style operation strings for FilterByValueBlock.
        if migrated_config.get("block_type") == "FilterByValueBlock":
            block_config_section = migrated_config.get("block_config", {})
            if "operation" in block_config_section:
                operation = block_config_section["operation"]
                if isinstance(operation, str) and operation.startswith("operator."):
                    # BUGFIX: copy the nested dict before mutating it — the
                    # shallow top-level copy aliases "block_config", so writing
                    # through it would modify the caller's original config.
                    block_config_section = dict(block_config_section)
                    block_config_section["operation"] = operation.replace(
                        "operator.", ""
                    )
                    migrated_config["block_config"] = block_config_section
                    logger.debug(
                        f"Converted operation from {operation} to {block_config_section['operation']}"
                    )

        return migrated_config, runtime_params