sdg-hub 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sdg_hub/_version.py +16 -3
  2. sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
  3. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
  4. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
  5. sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
  6. sdg_hub/core/blocks/llm/__init__.py +2 -0
  7. sdg_hub/core/blocks/llm/client_manager.py +61 -24
  8. sdg_hub/core/blocks/llm/config.py +1 -0
  9. sdg_hub/core/blocks/llm/llm_chat_block.py +62 -7
  10. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +653 -0
  11. sdg_hub/core/blocks/llm/text_parser_block.py +75 -30
  12. sdg_hub/core/blocks/registry.py +49 -35
  13. sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
  14. sdg_hub/core/flow/base.py +370 -20
  15. sdg_hub/core/flow/checkpointer.py +333 -0
  16. sdg_hub/core/flow/metadata.py +45 -0
  17. sdg_hub/core/flow/migration.py +12 -1
  18. sdg_hub/core/flow/registry.py +121 -58
  19. sdg_hub/core/flow/validation.py +12 -0
  20. sdg_hub/core/utils/__init__.py +2 -1
  21. sdg_hub/core/utils/datautils.py +81 -1
  22. sdg_hub/core/utils/flow_id_words.yaml +231 -0
  23. sdg_hub/core/utils/flow_identifier.py +94 -0
  24. sdg_hub/core/utils/yaml_utils.py +59 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +1 -7
  26. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/METADATA +59 -31
  27. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/RECORD +30 -25
  28. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/WHEEL +0 -0
  29. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/licenses/LICENSE +0 -0
  30. {sdg_hub-0.2.0.dist-info → sdg_hub-0.2.2.dist-info}/top_level.txt +0 -0
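The bulk of this release is the rewrite of sdg_hub/core/flow/base.py shown below: Flow.generate() gains checkpointing (checkpoint_dir and save_freq, backed by the new FlowCheckpointer) and a max_concurrency cap on LLM requests, and flows now persist a generated metadata id back to their YAML. A minimal sketch of the new call surface, assuming the import paths implied by this file; the flow path is hypothetical and the dataset columns are borrowed from the docstring examples in the diff:

    from datasets import Dataset
    from sdg_hub.core.flow.base import Flow

    flow = Flow.from_yaml("flows/my_flow.yaml")  # hypothetical path
    dataset = Dataset.from_dict({
        "document": ["Your document text"],
        "domain": ["Computer Science"],
        "icl_document": ["Example document"],
    })

    result = flow.generate(
        dataset,
        checkpoint_dir="checkpoints/run1",  # new: enables checkpointing
        save_freq=100,                      # new: checkpoint every 100 completed samples
        max_concurrency=8,                  # new: cap concurrent LLM requests
    )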
sdg_hub/core/flow/base.py CHANGED
@@ -4,19 +4,28 @@
  # Standard
  from pathlib import Path
  from typing import Any, Optional, Union
+ import time

  # Third Party
  from datasets import Dataset
  from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+ from rich.tree import Tree
+ import datasets
  import yaml

  # Local
  from ..blocks.base import BaseBlock
  from ..blocks.registry import BlockRegistry
+ from ..utils.datautils import safe_concatenate_with_validation, validate_no_duplicates
  from ..utils.error_handling import EmptyDatasetError, FlowValidationError
  from ..utils.logger_config import setup_logger
  from ..utils.path_resolution import resolve_path
- from .metadata import FlowMetadata, FlowParameter
+ from ..utils.yaml_utils import save_flow_yaml
+ from .checkpointer import FlowCheckpointer
+ from .metadata import DatasetRequirements, FlowMetadata, FlowParameter
  from .migration import FlowMigration
  from .validation import FlowValidator

@@ -133,7 +142,17 @@ class Flow(BaseModel):
          -------
          Flow
              Validated Flow instance.
+
+         Raises
+         ------
+         FlowValidationError
+             If yaml_path is None or the file doesn't exist.
          """
+         if yaml_path is None:
+             raise FlowValidationError(
+                 "Flow path cannot be None. Please provide a valid YAML file path or check that the flow exists in the registry."
+             )
+
          yaml_path = resolve_path(yaml_path, [])
          yaml_dir = Path(yaml_path).parent

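The new None guard in from_yaml() raises a clear FlowValidationError instead of whatever error resolve_path would otherwise produce on None. A minimal sketch, assuming the public import paths implied by this file:

    from sdg_hub.core.flow.base import Flow
    from sdg_hub.core.utils.error_handling import FlowValidationError

    try:
        Flow.from_yaml(None)  # e.g. a registry lookup that returned no path
    except FlowValidationError as err:
        print(err)  # "Flow path cannot be None. ..."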
@@ -160,6 +179,8 @@ class Flow(BaseModel):
              flow_config, migrated_runtime_params = FlowMigration.migrate_to_new_format(
                  flow_config, yaml_path
              )
+             # Save migrated config back to YAML to persist id
+             save_flow_yaml(yaml_path, flow_config, "migrated to new format")

          # Validate YAML structure
          validator = FlowValidator()
@@ -221,6 +242,17 @@ class Flow(BaseModel):
          # Create and validate the flow
          try:
              flow = cls(blocks=blocks, metadata=metadata, parameters=parameters)
+             # Persist generated id back to the YAML file (only on initial load)
+             # If the file had no metadata.id originally, update and rewrite
+             if not flow_config.get("metadata", {}).get("id"):
+                 flow_config.setdefault("metadata", {})["id"] = flow.metadata.id
+                 save_flow_yaml(
+                     yaml_path,
+                     flow_config,
+                     f"added generated id: {flow.metadata.id}",
+                 )
+             else:
+                 logger.debug(f"Flow already had id: {flow.metadata.id}")
              # Store migrated runtime params and client for backward compatibility
              if migrated_runtime_params:
                  flow._migrated_runtime_params = migrated_runtime_params
@@ -275,13 +307,11 @@ class Flow(BaseModel):

          # Get block class from registry
          try:
-             block_class = BlockRegistry.get(block_type_name)
+             block_class = BlockRegistry._get(block_type_name)
          except KeyError as exc:
              # Get all available blocks from all categories
-             all_blocks = BlockRegistry.all()
-             available_blocks = ", ".join(
-                 [block for blocks in all_blocks.values() for block in blocks]
-             )
+             all_blocks = BlockRegistry.list_blocks()
+             available_blocks = ", ".join(all_blocks)
              raise FlowValidationError(
                  f"Block type '{block_type_name}' not found in registry. "
                  f"Available blocks: {available_blocks}"
@@ -324,6 +354,9 @@ class Flow(BaseModel):
          self,
          dataset: Dataset,
          runtime_params: Optional[dict[str, dict[str, Any]]] = None,
+         checkpoint_dir: Optional[str] = None,
+         save_freq: Optional[int] = None,
+         max_concurrency: Optional[int] = None,
      ) -> Dataset:
          """Execute the flow blocks in sequence to generate data.

@@ -340,6 +373,14 @@ class Flow(BaseModel):
                  "block_name": {"param1": value1, "param2": value2},
                  "other_block": {"param3": value3}
              }
+         checkpoint_dir : Optional[str], optional
+             Directory to save/load checkpoints. If provided, enables checkpointing.
+         save_freq : Optional[int], optional
+             Number of completed samples after which to save a checkpoint.
+             If None, only saves final results when checkpointing is enabled.
+         max_concurrency : Optional[int], optional
+             Maximum number of concurrent requests across all blocks.
+             Controls async request concurrency to prevent overwhelming servers.

          Returns
          -------
@@ -353,6 +394,26 @@ class Flow(BaseModel):
          FlowValidationError
              If flow validation fails or if model configuration is required but not set.
          """
+         # Validate save_freq parameter early to prevent range() errors
+         if save_freq is not None and save_freq <= 0:
+             raise FlowValidationError(
+                 f"save_freq must be greater than 0, got {save_freq}"
+             )
+
+         # Validate max_concurrency parameter
+         if max_concurrency is not None:
+             # Explicitly reject boolean values (bool is a subclass of int in Python)
+             if isinstance(max_concurrency, bool) or not isinstance(
+                 max_concurrency, int
+             ):
+                 raise FlowValidationError(
+                     f"max_concurrency must be an int, got {type(max_concurrency).__name__}"
+                 )
+             if max_concurrency <= 0:
+                 raise FlowValidationError(
+                     f"max_concurrency must be greater than 0, got {max_concurrency}"
+                 )
+
          # Validate preconditions
          if not self.blocks:
              raise FlowValidationError("Cannot generate with empty flow")
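A note on the explicit bool rejection above: bool is a subclass of int in Python, so a bare isinstance(max_concurrency, int) check would accept True and False, and since True == 1 a call with max_concurrency=True would silently run with a concurrency of one instead of raising. A quick standalone illustration:

    print(isinstance(True, int))   # True — bool passes a plain int check
    print(True == 1, False == 0)   # True True — True would mean concurrency 1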
@@ -360,6 +421,8 @@ class Flow(BaseModel):
          if len(dataset) == 0:
              raise EmptyDatasetError("Input dataset is empty")

+         validate_no_duplicates(dataset)
+
          # Check if model configuration has been set for flows with LLM blocks
          llm_blocks = self._detect_llm_blocks()
          if llm_blocks and not self._model_config_set:
@@ -376,18 +439,131 @@ class Flow(BaseModel):
              "Dataset validation failed:\n" + "\n".join(dataset_errors)
          )

+         # Log concurrency control if specified
+         if max_concurrency is not None:
+             logger.info(f"Using max_concurrency={max_concurrency} for LLM requests")
+
+         # Initialize checkpointer if enabled
+         checkpointer = None
+         completed_dataset = None
+         if checkpoint_dir:
+             checkpointer = FlowCheckpointer(
+                 checkpoint_dir=checkpoint_dir,
+                 save_freq=save_freq,
+                 flow_id=self.metadata.id,
+             )
+
+             # Load existing progress
+             remaining_dataset, completed_dataset = checkpointer.load_existing_progress(
+                 dataset
+             )
+
+             if len(remaining_dataset) == 0:
+                 logger.info("All samples already completed, returning existing results")
+                 return completed_dataset
+
+             dataset = remaining_dataset
+             logger.info(f"Resuming with {len(dataset)} remaining samples")
+
          logger.info(
              f"Starting flow '{self.metadata.name}' v{self.metadata.version} "
              f"with {len(dataset)} samples across {len(self.blocks)} blocks"
+             + (f" (max_concurrency={max_concurrency})" if max_concurrency else "")
          )

-         current_dataset = dataset
          # Merge migrated runtime params with provided ones (provided ones take precedence)
          merged_runtime_params = self._migrated_runtime_params.copy()
          if runtime_params:
              merged_runtime_params.update(runtime_params)
          runtime_params = merged_runtime_params

+         # Process dataset in chunks if checkpointing with save_freq
+         if checkpointer and save_freq:
+             all_processed = []
+
+             # Process in chunks of save_freq
+             for i in range(0, len(dataset), save_freq):
+                 chunk_end = min(i + save_freq, len(dataset))
+                 chunk_dataset = dataset.select(range(i, chunk_end))
+
+                 logger.info(
+                     f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
+                 )
+
+                 # Execute all blocks on this chunk
+                 processed_chunk = self._execute_blocks_on_dataset(
+                     chunk_dataset, runtime_params, max_concurrency
+                 )
+                 all_processed.append(processed_chunk)
+
+                 # Save checkpoint after chunk completion
+                 checkpointer.add_completed_samples(processed_chunk)
+
+             # Save final checkpoint for any remaining samples
+             checkpointer.save_final_checkpoint()
+
+             # Combine all processed chunks
+             final_dataset = safe_concatenate_with_validation(
+                 all_processed, "processed chunks from flow execution"
+             )
+
+             # Combine with previously completed samples if any
+             if checkpointer and completed_dataset:
+                 final_dataset = safe_concatenate_with_validation(
+                     [completed_dataset, final_dataset],
+                     "completed checkpoint data with newly processed data",
+                 )
+
+         else:
+             # Process entire dataset at once
+             final_dataset = self._execute_blocks_on_dataset(
+                 dataset, runtime_params, max_concurrency
+             )
+
+             # Save final checkpoint if checkpointing enabled
+             if checkpointer:
+                 checkpointer.add_completed_samples(final_dataset)
+                 checkpointer.save_final_checkpoint()
+
+                 # Combine with previously completed samples if any
+                 if completed_dataset:
+                     final_dataset = safe_concatenate_with_validation(
+                         [completed_dataset, final_dataset],
+                         "completed checkpoint data with newly processed data",
+                     )
+
+         logger.info(
+             f"Flow '{self.metadata.name}' completed successfully: "
+             f"{len(final_dataset)} final samples, "
+             f"{len(final_dataset.column_names)} final columns"
+         )
+
+         return final_dataset
+
+     def _execute_blocks_on_dataset(
+         self,
+         dataset: Dataset,
+         runtime_params: dict[str, dict[str, Any]],
+         max_concurrency: Optional[int] = None,
+     ) -> Dataset:
+         """Execute all blocks in sequence on the given dataset.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             Dataset to process through all blocks.
+         runtime_params : Dict[str, Dict[str, Any]]
+             Runtime parameters for block execution.
+         max_concurrency : Optional[int], optional
+             Maximum concurrency for LLM requests across blocks.
+
+         Returns
+         -------
+         Dataset
+             Dataset after processing through all blocks.
+         """
+         current_dataset = dataset
+
          # Execute blocks in sequence
          for i, block in enumerate(self.blocks):
              logger.info(
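For the chunked path above, the range(0, len(dataset), save_freq) loop yields save_freq-sized slices with a short final chunk when the dataset size is not a multiple of save_freq. A standalone sketch of the boundary arithmetic, using hypothetical numbers (250 samples, save_freq=100):

    n, save_freq = 250, 100
    for i in range(0, n, save_freq):
        chunk_end = min(i + save_freq, n)  # clamp the last chunk to the dataset size
        print(f"chunk {i // save_freq + 1}: samples {i}..{chunk_end - 1}")
    # chunk 1: samples 0..99
    # chunk 2: samples 100..199
    # chunk 3: samples 200..249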
@@ -398,6 +574,10 @@ class Flow(BaseModel):
              # Prepare block execution parameters
              block_kwargs = self._prepare_block_kwargs(block, runtime_params)

+             # Add max_concurrency to block kwargs if provided
+             if max_concurrency is not None:
+                 block_kwargs["_flow_max_concurrency"] = max_concurrency
+
              try:
                  # Check if this is a deprecated block and skip validations
                  is_deprecated_block = (
@@ -436,12 +616,6 @@ class Flow(BaseModel):
                      f"Block '{block.block_name}' execution failed: {exc}"
                  ) from exc

-         logger.info(
-             f"Flow '{self.metadata.name}' completed successfully: "
-             f"{len(current_dataset)} final samples, "
-             f"{len(current_dataset.column_names)} final columns"
-         )
-
          return current_dataset

      def _prepare_block_kwargs(
@@ -760,6 +934,8 @@ class Flow(BaseModel):
          if len(dataset) == 0:
              raise EmptyDatasetError("Input dataset is empty")

+         validate_no_duplicates(dataset)
+
          # Use smaller sample size if dataset is smaller
          actual_sample_size = min(sample_size, len(dataset))

@@ -784,9 +960,6 @@ class Flow(BaseModel):
              "execution_time_seconds": 0,
          }

-         # Standard
-         import time
-
          start_time = time.time()

          try:
@@ -930,6 +1103,186 @@ class Flow(BaseModel):
              "block_names": [block.block_name for block in self.blocks],
          }

+     def get_dataset_requirements(self) -> Optional[DatasetRequirements]:
+         """Get the dataset requirements for this flow.
+
+         Returns
+         -------
+         Optional[DatasetRequirements]
+             Dataset requirements object or None if not defined.
+
+         Examples
+         --------
+         >>> flow = Flow.from_yaml("path/to/flow.yaml")
+         >>> requirements = flow.get_dataset_requirements()
+         >>> if requirements:
+         ...     print(f"Required columns: {requirements.required_columns}")
+         """
+         return self.metadata.dataset_requirements
+
+     def get_dataset_schema(self) -> Dataset:
+         """Get an empty dataset with the correct schema for this flow.
+
+         Returns
+         -------
+         Dataset
+             Empty HuggingFace Dataset with the correct schema/features for this flow.
+             Users can add data to this dataset or use it to validate their own dataset schema.
+
+         Examples
+         --------
+         >>> flow = Flow.from_yaml("path/to/flow.yaml")
+         >>> schema_dataset = flow.get_dataset_schema()
+         >>>
+         >>> # Add your data
+         >>> schema_dataset = schema_dataset.add_item({
+         ...     "document": "Your document text",
+         ...     "domain": "Computer Science",
+         ...     "icl_document": "Example document"
+         ... })
+         >>>
+         >>> # Or validate your existing dataset schema
+         >>> my_dataset = Dataset.from_dict(my_data)
+         >>> if my_dataset.features == schema_dataset.features:
+         ...     print("Schema matches!")
+         """
+
+         requirements = self.get_dataset_requirements()
+
+         if requirements is None:
+             # Return empty dataset with no schema requirements
+             return Dataset.from_dict({})
+
+         # Build schema features
+         schema_features = {}
+
+         # Process required columns
+         for col_name in requirements.required_columns:
+             col_type = requirements.column_types.get(col_name, "string")
+             schema_features[col_name] = self._map_column_type_to_feature(col_type)
+
+         # Process optional columns
+         for col_name in requirements.optional_columns:
+             col_type = requirements.column_types.get(col_name, "string")
+             schema_features[col_name] = self._map_column_type_to_feature(col_type)
+
+         # Create empty dataset with the correct features
+         features = datasets.Features(schema_features)
+         empty_data = {col_name: [] for col_name in schema_features.keys()}
+
+         return Dataset.from_dict(empty_data, features=features)
+
+     def _map_column_type_to_feature(self, col_type: str):
+         """Map column type string to HuggingFace feature type."""
+         # Map common type names to HuggingFace types
+         if col_type in ["str", "string", "text"]:
+             return datasets.Value("string")
+         elif col_type in ["int", "integer"]:
+             return datasets.Value("int64")
+         elif col_type in ["float", "number"]:
+             return datasets.Value("float64")
+         elif col_type in ["bool", "boolean"]:
+             return datasets.Value("bool")
+         else:
+             # Default to string for unknown types
+             return datasets.Value("string")
+
+     def print_info(self) -> None:
+         """
+         Print an interactive summary of the Flow in the console.
+
+         The summary contains:
+         1. Flow metadata (name, version, author, description)
+         2. Defined runtime parameters with type hints and defaults
+         3. A table of all blocks with their input and output columns
+
+         Notes
+         -----
+         Uses the `rich` library for colourised output; install with
+         `pip install rich` if not already present.
+
+         Returns
+         -------
+         None
+         """
+
+         console = Console()
+
+         # Create main tree structure
+         flow_tree = Tree(
+             f"[bold bright_blue]{self.metadata.name}[/bold bright_blue] Flow"
+         )
+
+         # Metadata section
+         metadata_branch = flow_tree.add(
+             "[bold bright_green]Metadata[/bold bright_green]"
+         )
+         metadata_branch.add(
+             f"Version: [bright_cyan]{self.metadata.version}[/bright_cyan]"
+         )
+         metadata_branch.add(
+             f"Author: [bright_cyan]{self.metadata.author}[/bright_cyan]"
+         )
+         if self.metadata.description:
+             metadata_branch.add(
+                 f"Description: [white]{self.metadata.description}[/white]"
+             )
+
+         # Parameters section
+         if self.parameters:
+             params_branch = flow_tree.add(
+                 "[bold bright_yellow]Parameters[/bold bright_yellow]"
+             )
+             for name, param in self.parameters.items():
+                 param_info = f"[bright_cyan]{name}[/bright_cyan]: [white]{param.type_hint}[/white]"
+                 if param.default is not None:
+                     param_info += f" = [bright_white]{param.default}[/bright_white]"
+                 params_branch.add(param_info)
+
+         # Blocks overview
+         flow_tree.add(
+             f"[bold bright_magenta]Blocks[/bold bright_magenta] ({len(self.blocks)} total)"
+         )
+
+         # Create blocks table
+         blocks_table = Table(show_header=True, header_style="bold bright_white")
+         blocks_table.add_column("Block Name", style="bright_cyan")
+         blocks_table.add_column("Type", style="bright_green")
+         blocks_table.add_column("Input Cols", style="bright_yellow")
+         blocks_table.add_column("Output Cols", style="bright_red")
+
+         for block in self.blocks:
+             input_cols = getattr(block, "input_cols", None)
+             output_cols = getattr(block, "output_cols", None)
+
+             blocks_table.add_row(
+                 block.block_name,
+                 block.__class__.__name__,
+                 str(input_cols) if input_cols else "[bright_black]None[/bright_black]",
+                 str(output_cols)
+                 if output_cols
+                 else "[bright_black]None[/bright_black]",
+             )
+
+         # Print everything
+         console.print()
+         console.print(
+             Panel(
+                 flow_tree,
+                 title="[bold bright_white]Flow Information[/bold bright_white]",
+                 border_style="bright_blue",
+             )
+         )
+         console.print()
+         console.print(
+             Panel(
+                 blocks_table,
+                 title="[bold bright_white]Block Details[/bold bright_white]",
+                 border_style="bright_magenta",
+             )
+         )
+         console.print()
+
      def to_yaml(self, output_path: str) -> None:
          """Save flow configuration to YAML file.

@@ -952,10 +1305,7 @@ class Flow(BaseModel):
              name: param.model_dump() for name, param in self.parameters.items()
          }

-         with open(output_path, "w", encoding="utf-8") as f:
-             yaml.dump(config, f, default_flow_style=False, sort_keys=False)
-
-         logger.info(f"Flow configuration saved to: {output_path}")
+         save_flow_yaml(output_path, config)

      def __len__(self) -> int:
          """Number of blocks in the flow."""