sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,389 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Flow metadata and parameter definitions."""
|
3
|
+
|
4
|
+
# Standard
|
5
|
+
from datetime import datetime
|
6
|
+
from enum import Enum
|
7
|
+
from typing import Any, Optional
|
8
|
+
|
9
|
+
# Third Party
|
10
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
11
|
+
|
12
|
+
# Local
|
13
|
+
from ..utils.flow_identifier import get_flow_identifier
|
14
|
+
|
15
|
+
|
16
|
+
class ModelCompatibility(str, Enum):
    """Enumeration of how well a model works with a given flow.

    Inherits from ``str`` so members compare equal to (and serialize as)
    their plain string values.
    """

    # The flow depends on this model specifically.
    REQUIRED = "required"
    # Suggested for best results.
    RECOMMENDED = "recommended"
    # Known to work with the flow.
    COMPATIBLE = "compatible"
    # May work; not fully validated.
    EXPERIMENTAL = "experimental"
|
24
|
+
|
25
|
+
class ModelOption(BaseModel):
    """A single model choice paired with its compatibility level.

    Attributes
    ----------
    name : str
        Model identifier (e.g., "gpt-4", "claude-3-sonnet")
    compatibility : ModelCompatibility
        Compatibility level with the flow
    """

    name: str = Field(..., description="Model identifier")
    compatibility: ModelCompatibility = Field(
        default=ModelCompatibility.COMPATIBLE,
        description="Compatibility level with the flow",
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject empty model names."""
        cleaned = v.strip()
        if not cleaned:
            raise ValueError("Model name cannot be empty")
        return cleaned
|
49
|
+
|
50
|
+
|
51
|
+
class RecommendedModels(BaseModel):
    """Recommended model lists for a flow, ordered by preference.

    Attributes
    ----------
    default : str
        The default model to use
    compatible : List[str]
        Models known to work with the flow
    experimental : List[str]
        Models that may work but are not fully validated
    """

    default: str = Field(..., description="Default model to use")
    compatible: list[str] = Field(default_factory=list, description="Compatible models")
    experimental: list[str] = Field(
        default_factory=list, description="Experimental models"
    )

    @field_validator("default")
    @classmethod
    def validate_default(cls, v: str) -> str:
        """Strip whitespace and reject an empty default model name."""
        cleaned = v.strip()
        if not cleaned:
            raise ValueError("Default model name cannot be empty")
        return cleaned

    @field_validator("compatible", "experimental")
    @classmethod
    def validate_model_lists(cls, v: list[str]) -> list[str]:
        """Strip each entry and drop the ones that end up empty."""
        stripped = (entry.strip() for entry in v)
        return [entry for entry in stripped if entry]

    def get_all_models(self) -> list[str]:
        """Return every known model: default first, then compatible, then experimental."""
        return [self.default, *self.compatible, *self.experimental]

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Pick the most-preferred model that is actually available.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        if available_models is None:
            return self.default

        # Preference order is exactly get_all_models(): default, then
        # compatible, then experimental.
        available = set(available_models)
        for candidate in self.get_all_models():
            if candidate in available:
                return candidate
        return None
|
119
|
+
|
120
|
+
|
121
|
+
class FlowParameter(BaseModel):
    """A runtime-tunable parameter exposed by a flow.

    Attributes
    ----------
    default : Any
        Default value for the parameter.
    description : str
        Human-readable description of the parameter.
    type_hint : str
        Type hint as string (e.g., "float", "str").
    required : bool
        Whether this parameter is required at runtime.
    constraints : Dict[str, Any]
        Additional constraints for the parameter.
    """

    default: Any = Field(..., description="Default value for the parameter")
    description: str = Field(default="", description="Human-readable description")
    type_hint: str = Field(default="Any", description="Type hint as string")
    required: bool = Field(default=False, description="Whether parameter is required")
    constraints: dict[str, Any] = Field(
        default_factory=dict, description="Additional constraints for the parameter"
    )

    @model_validator(mode="after")
    def validate_required_default(self) -> "FlowParameter":
        """Reject a parameter that is marked required but defaults to None."""
        if self.default is None and self.required:
            raise ValueError("Required parameters cannot have None as default")
        return self
|
152
|
+
|
153
|
+
|
154
|
+
class DatasetRequirements(BaseModel):
    """Constraints an input dataset must satisfy before a flow runs.

    Attributes
    ----------
    required_columns : List[str]
        Column names that must be present in the input dataset.
    optional_columns : List[str]
        Column names that are optional but can enhance flow performance.
    min_samples : int
        Minimum number of samples required in the dataset.
    max_samples : Optional[int]
        Maximum number of samples to process (for resource management).
    column_types : Dict[str, str]
        Expected types for specific columns.
    description : str
        Human-readable description of dataset requirements.
    """

    required_columns: list[str] = Field(
        default_factory=list, description="Column names that must be present"
    )
    optional_columns: list[str] = Field(
        default_factory=list,
        description="Optional columns that can enhance performance",
    )
    min_samples: int = Field(
        default=1, ge=1, description="Minimum number of samples required"
    )
    max_samples: Optional[int] = Field(
        default=None, gt=0, description="Maximum number of samples to process"
    )
    column_types: dict[str, str] = Field(
        default_factory=dict, description="Expected types for specific columns"
    )
    description: str = Field(default="", description="Human-readable description")

    @field_validator("required_columns", "optional_columns")
    @classmethod
    def validate_column_names(cls, v: list[str]) -> list[str]:
        """Strip each column name and drop entries that end up empty."""
        stripped = (col.strip() for col in v)
        return [col for col in stripped if col]

    @model_validator(mode="after")
    def validate_sample_limits(self) -> "DatasetRequirements":
        """Ensure max_samples, when set, is not below min_samples."""
        if self.max_samples is None:
            return self
        if self.max_samples < self.min_samples:
            raise ValueError("max_samples must be greater than or equal to min_samples")
        return self

    def validate_dataset(
        self, dataset_columns: list[str], dataset_size: int
    ) -> list[str]:
        """Check a dataset against these requirements.

        Only required columns and the minimum sample count are checked here;
        column_types and max_samples are not enforced by this method.

        Parameters
        ----------
        dataset_columns : List[str]
            Column names in the dataset.
        dataset_size : int
            Number of samples in the dataset.

        Returns
        -------
        List[str]
            List of validation error messages. Empty if valid.
        """
        errors: list[str] = []

        missing_columns = [
            name for name in self.required_columns if name not in dataset_columns
        ]
        if missing_columns:
            errors.append(f"Missing required columns: {missing_columns}")

        if dataset_size < self.min_samples:
            errors.append(
                f"Dataset has {dataset_size} samples, minimum required: {self.min_samples}"
            )

        return errors
|
237
|
+
|
238
|
+
|
239
|
+
class FlowMetadata(BaseModel):
    """Metadata describing a flow, for configuration and contributions.

    Attributes
    ----------
    name : str
        Human-readable name of the flow.
    id : str
        Unique identifier; auto-generated from ``name`` when left empty.
    description : str
        Detailed description of what the flow does.
    version : str
        Semantic version (e.g., "1.0.0").
    author : str
        Author or contributor name.
    recommended_models : Optional[RecommendedModels]
        Recommended models (default, compatible, experimental lists).
    tags : List[str]
        Tags for categorization and search.
    created_at : str
        Creation timestamp.
    updated_at : str
        Last update timestamp.
    license : str
        License identifier.
    min_sdg_hub_version : str
        Minimum required SDG Hub version.
    dataset_requirements : Optional[DatasetRequirements]
        Requirements for input datasets.
    estimated_cost : str
        Estimated cost tier ("low", "medium", or "high").
    estimated_duration : str
        Estimated duration for flow execution.
    """

    name: str = Field(..., min_length=1, description="Human-readable name")
    id: str = Field(
        default="", description="Unique identifier for the flow, generated from name"
    )
    description: str = Field(default="", description="Detailed description")
    version: str = Field(
        default="1.0.0",
        pattern=r"^\d+\.\d+\.\d+(-[a-zA-Z0-9.-]+)?$",
        description="Semantic version",
    )
    author: str = Field(default="", description="Author or contributor name")
    recommended_models: Optional[RecommendedModels] = Field(
        default=None, description="Simplified recommended models structure"
    )
    tags: list[str] = Field(
        default_factory=list, description="Tags for categorization and search"
    )
    created_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Creation timestamp",
    )
    updated_at: str = Field(
        default_factory=lambda: datetime.now().isoformat(),
        description="Last update timestamp",
    )
    license: str = Field(default="Apache-2.0", description="License identifier")
    min_sdg_hub_version: str = Field(
        default="", description="Minimum required SDG Hub version"
    )
    dataset_requirements: Optional[DatasetRequirements] = Field(
        default=None, description="Requirements for input datasets"
    )
    estimated_cost: str = Field(
        default="medium",
        pattern="^(low|medium|high)$",
        description="Estimated cost tier for running the flow",
    )
    estimated_duration: str = Field(
        default="", description="Estimated duration for flow execution"
    )

    @field_validator("id")
    @classmethod
    def validate_id(cls, v: str) -> str:
        """Validate the format of an explicitly provided id.

        Auto-generation from ``name`` happens in the ``ensure_id`` model
        validator, because field validators cannot see other field values
        in Pydantic v2.
        """
        # An empty id is allowed here; ensure_id fills it in later.
        if not v:
            return v

        # Guard-clause checks: lowercase, alnum-plus-hyphen, no edge hyphens.
        if not v.islower():
            raise ValueError("id must be lowercase")
        if not v.replace("-", "").isalnum():
            raise ValueError(
                "id must contain only alphanumeric characters and hyphens"
            )
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("id must not start or end with a hyphen")

        return v

    @field_validator("tags")
    @classmethod
    def validate_tags(cls, v: list[str]) -> list[str]:
        """Normalize tags: strip whitespace, lowercase, drop empties."""
        stripped = (tag.strip() for tag in v)
        return [tag.lower() for tag in stripped if tag]

    @field_validator("recommended_models")
    @classmethod
    def validate_recommended_models(
        cls, v: Optional[RecommendedModels]
    ) -> Optional[RecommendedModels]:
        """Pass-through hook; RecommendedModels validates itself."""
        return v

    def update_timestamp(self) -> None:
        """Refresh ``updated_at`` to the current time."""
        self.updated_at = datetime.now().isoformat()

    @model_validator(mode="after")
    def ensure_id(self) -> "FlowMetadata":
        """Derive ``id`` from ``name`` when not explicitly provided.

        Note: YAML persistence is handled by Flow.from_yaml() and FlowRegistry
        to maintain proper separation of concerns.
        """
        if self.name and not self.id:
            self.id = get_flow_identifier(self.name)
        return self

    def get_best_model(
        self, available_models: Optional[list[str]] = None
    ) -> Optional[str]:
        """Get the best recommended model based on availability.

        Parameters
        ----------
        available_models : Optional[List[str]]
            List of available model names. If None, returns default model.

        Returns
        -------
        Optional[str]
            Best model name or None if no models available.
        """
        if self.recommended_models is None:
            return None
        return self.recommended_models.get_best_model(available_models)
|
@@ -0,0 +1,198 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Migration utilities for backward compatibility with old flow formats."""
|
3
|
+
|
4
|
+
# Standard
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Any, Union
|
7
|
+
|
8
|
+
# Local
|
9
|
+
from ..utils.logger_config import setup_logger
|
10
|
+
|
11
|
+
logger = setup_logger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class FlowMigration:
    """Utility class for migrating old flow formats to new format."""

    @staticmethod
    def is_old_format(flow_config: Union[list[dict[str, Any]], dict[str, Any]]) -> bool:
        """Detect if a flow configuration is in the old format.

        Old format: a bare array of block configs, or a mapping whose values
        look like block configs. New format: a mapping with 'metadata' and
        'blocks' keys.

        Parameters
        ----------
        flow_config : Union[List[Dict[str, Any]], Dict[str, Any]]
            The loaded YAML configuration.

        Returns
        -------
        bool
            True if the configuration is in old format, False otherwise.
        """
        # A bare list of blocks is always the legacy layout.
        if isinstance(flow_config, list):
            return True

        if not isinstance(flow_config, dict):
            # Neither list nor dict: cannot classify, assume new format.
            return False

        has_metadata = "metadata" in flow_config
        has_blocks = "blocks" in flow_config

        if has_metadata and has_blocks:
            return False

        if not has_metadata and not has_blocks:
            # No new-format keys at all: call it old format only when the
            # values look like block configurations.
            return any(
                isinstance(value, dict) and "block_type" in value
                for value in flow_config.values()
            )

        # Exactly one of the two keys present: assume new format.
        return False

    @staticmethod
    def migrate_to_new_format(
        flow_config: list[dict[str, Any]], yaml_path: str
    ) -> tuple[dict[str, Any], dict[str, dict[str, Any]]]:
        """Migrate old format flow configuration to new format.

        Parameters
        ----------
        flow_config : List[Dict[str, Any]]
            Old format flow configuration (array of blocks).
        yaml_path : str
            Path to the original YAML file for generating metadata.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]
            Tuple of (new format flow configuration, extracted runtime_params).
        """
        logger.info(f"Migrating old flow format from: {yaml_path}")

        # Metadata is synthesized from the YAML file's base name.
        metadata = FlowMigration._generate_default_metadata(Path(yaml_path).stem)

        migrated_blocks: list[dict[str, Any]] = []
        runtime_params: dict[str, dict[str, Any]] = {}

        for i, raw_block in enumerate(flow_config):
            try:
                converted, extracted = FlowMigration._migrate_block_config(raw_block)
                migrated_blocks.append(converted)

                # Record the block's extracted runtime params under its name.
                if extracted:
                    owner = converted.get("block_config", {}).get("block_name")
                    if owner:
                        runtime_params[owner] = extracted
            except Exception as exc:
                # Best-effort migration: keep the original block on failure.
                logger.warning(f"Failed to migrate block at index {i}: {exc}")
                migrated_blocks.append(raw_block)

        new_config = {"metadata": metadata, "blocks": migrated_blocks}

        logger.info(f"Successfully migrated flow with {len(migrated_blocks)} blocks")
        logger.info(f"Extracted runtime_params for {len(runtime_params)} blocks")

        return new_config, runtime_params

    @staticmethod
    def _generate_default_metadata(flow_name: str) -> dict[str, Any]:
        """Generate default metadata for migrated flows."""
        # Import here to avoid circular import
        from ..utils.flow_identifier import get_flow_identifier

        metadata: dict[str, Any] = {
            "name": flow_name,
            "description": f"Migrated flow: {flow_name}",
            "version": "1.0.0",
            "author": "SDG_Hub",
            "tags": ["migrated"],
            "recommended_models": {
                "default": "meta-llama/Llama-3.3-70B-Instruct",
                "compatible": [],
                "experimental": [],
            },
        }

        # Attach a generated identifier when one can be derived from the name.
        flow_id = get_flow_identifier(flow_name)
        if flow_id:
            metadata["id"] = flow_id
            logger.debug(f"Generated id for migrated flow: {flow_id}")

        return metadata

    @staticmethod
    def _migrate_block_config(
        block_config: dict[str, Any],
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """Migrate individual block configuration from old to new format.

        Parameters
        ----------
        block_config : Dict[str, Any]
            Old format block configuration.

        Returns
        -------
        tuple[Dict[str, Any], Dict[str, Any]]
            Tuple of (migrated block configuration, extracted runtime_params).
        """
        # Non-dict entries are passed through untouched.
        if not isinstance(block_config, dict):
            return block_config, {}

        migrated_config = dict(block_config)
        runtime_params: dict[str, Any] = {}

        # gen_kwargs moves out of the block and becomes runtime_params.
        if "gen_kwargs" in migrated_config:
            runtime_params = migrated_config.pop("gen_kwargs")
            logger.debug(f"Extracted gen_kwargs as runtime_params: {runtime_params}")

        # Fields with no equivalent in the new format are dropped.
        for unsupported_field in ("drop_columns", "drop_duplicates", "batch_kwargs"):
            if unsupported_field in migrated_config:
                del migrated_config[unsupported_field]
                logger.debug(
                    f"Ignoring {unsupported_field} as it's not supported in new flow format"
                )

        block_type = migrated_config.get("block_type")
        block_config_section = migrated_config.get("block_config", {})

        # parser_kwargs stays inside block_config for LLMBlock; only log it.
        if block_type == "LLMBlock" and "parser_kwargs" in block_config_section:
            parser_kwargs = block_config_section["parser_kwargs"]
            logger.debug(f"Preserving parser_kwargs for LLMBlock: {parser_kwargs}")

        # FilterByValueBlock: "operator.eq"-style strings lose their prefix.
        if block_type == "FilterByValueBlock":
            operation = block_config_section.get("operation")
            if isinstance(operation, str) and operation.startswith("operator."):
                block_config_section["operation"] = operation.replace("operator.", "")
                logger.debug(
                    f"Converted operation from {operation} to {block_config_section['operation']}"
                )

        return migrated_config, runtime_params
|