sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
sdg_hub/core/flow/base.py CHANGED
@@ -8,8 +8,6 @@ from typing import Any, Optional, Union
 import time
 import uuid

-# Third Party
-from datasets import Dataset
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -23,6 +21,9 @@ from rich.panel import Panel
 from rich.table import Table
 from rich.tree import Tree
 import datasets
+
+# Third Party
+import pandas as pd
 import yaml

 # Local
@@ -285,15 +286,62 @@ class Flow(BaseModel):
             return {key: str(yaml_dir / path) for key, path in paths.items()}
         return paths

+    @staticmethod
+    def _convert_to_dataframe(
+        dataset: Union[pd.DataFrame, datasets.Dataset],
+    ) -> tuple[pd.DataFrame, bool]:
+        """Convert datasets.Dataset to pd.DataFrame if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset in either format.
+
+        Returns
+        -------
+        tuple[pd.DataFrame, bool]
+            Tuple of (converted DataFrame, was_dataset flag).
+            was_dataset is True if input was a datasets.Dataset, False if it was already a DataFrame.
+        """
+        if isinstance(dataset, datasets.Dataset):
+            logger.info("Converting datasets.Dataset to pd.DataFrame for processing")
+            return dataset.to_pandas(), True
+        return dataset, False
+
+    @staticmethod
+    def _convert_from_dataframe(
+        df: pd.DataFrame, should_convert: bool
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
+        """Convert pd.DataFrame back to datasets.Dataset if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame to potentially convert.
+        should_convert : bool
+            If True, convert to datasets.Dataset. If False, return as-is.
+
+        Returns
+        -------
+        Union[pd.DataFrame, datasets.Dataset]
+            Original DataFrame or converted Dataset, matching the input type.
+        """
+        if should_convert:
+            logger.info(
+                "Converting pd.DataFrame back to datasets.Dataset to match input type"
+            )
+            return datasets.Dataset.from_pandas(df)
+        return df
+
     def generate(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         checkpoint_dir: Optional[str] = None,
         save_freq: Optional[int] = None,
         log_dir: Optional[str] = None,
         max_concurrency: Optional[int] = None,
-    ) -> Dataset:
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
         """Execute the flow blocks in sequence to generate data.

         Note: For flows with LLM blocks, set_model_config() must be called first
@@ -301,8 +349,9 @@ class Flow(BaseModel):

         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to process.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to process. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
             Runtime parameters organized by block name. Format:
             {
@@ -324,8 +373,9 @@ class Flow(BaseModel):

         Returns
         -------
-        Dataset
+        Union[pd.DataFrame, datasets.Dataset]
             Processed dataset after all blocks have been executed.
+            Return type matches the input type (DataFrame in -> DataFrame out, Dataset in -> Dataset out).

         Raises
         ------
@@ -334,6 +384,9 @@ class Flow(BaseModel):
         FlowValidationError
             If flow validation fails or if model configuration is required but not set.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, was_dataset = self._convert_to_dataframe(dataset)
+
         # Validate save_freq parameter early to prevent range() errors
         if save_freq is not None and save_freq <= 0:
             raise FlowValidationError(
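With this change, generate() accepts either container type and returns whatever type it was given. A self-contained sketch of that round trip, written as standalone functions that mirror the two helpers above (the column name is made up for illustration):

    import datasets
    import pandas as pd

    def to_dataframe(dataset):
        # Normalize input to a DataFrame, remembering whether a conversion happened.
        if isinstance(dataset, datasets.Dataset):
            return dataset.to_pandas(), True
        return dataset, False

    def from_dataframe(df, was_dataset):
        # Restore the caller's original container type.
        return datasets.Dataset.from_pandas(df) if was_dataset else df

    ds = datasets.Dataset.from_dict({"document": ["some text"]})
    df, was_dataset = to_dataframe(ds)        # -> (DataFrame, True)
    result = from_dataframe(df, was_dataset)  # -> datasets.Dataset again
    assert isinstance(result, datasets.Dataset)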
@@ -429,7 +482,7 @@ class Flow(BaseModel):
                 finally:
                     flow_logger.removeHandler(h)

-            return completed_dataset
+            return self._convert_from_dataframe(completed_dataset, was_dataset)

         dataset = remaining_dataset
         flow_logger.info(f"Resuming with {len(dataset)} remaining samples")
@@ -456,7 +509,7 @@ class Flow(BaseModel):
             # Process in chunks of save_freq
             for i in range(0, len(dataset), save_freq):
                 chunk_end = min(i + save_freq, len(dataset))
-                chunk_dataset = dataset.select(range(i, chunk_end))
+                chunk_dataset = dataset.iloc[i:chunk_end]

                 flow_logger.info(
                     f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
@@ -480,7 +533,11 @@ class Flow(BaseModel):
                 )

                 # Combine with previously completed samples if any
-                if checkpointer and completed_dataset:
+                if (
+                    checkpointer
+                    and completed_dataset is not None
+                    and not completed_dataset.empty
+                ):
                     final_dataset = safe_concatenate_with_validation(
                         [completed_dataset, final_dataset],
                         "completed checkpoint data with newly processed data",
@@ -498,7 +555,7 @@ class Flow(BaseModel):
                 checkpointer.save_final_checkpoint()

             # Combine with previously completed samples if any
-            if completed_dataset:
+            if completed_dataset is not None and not completed_dataset.empty:
                 final_dataset = safe_concatenate_with_validation(
                     [completed_dataset, final_dataset],
                     "completed checkpoint data with newly processed data",
@@ -536,7 +593,7 @@ class Flow(BaseModel):
            flow_logger.info(
                f"Flow '{self.metadata.name}' completed successfully: "
                f"{len(final_dataset)} final samples, "
-               f"{len(final_dataset.column_names)} final columns"
+               f"{len(final_dataset.columns)} final columns"
            )

            # Close file handlers if we opened a flow-specific logger
@@ -550,20 +607,20 @@ class Flow(BaseModel):
                 finally:
                     flow_logger.removeHandler(h)

-        return final_dataset
+        return self._convert_from_dataframe(final_dataset, was_dataset)

     def _execute_blocks_on_dataset(
         self,
-        dataset: Dataset,
+        dataset: pd.DataFrame,
         runtime_params: dict[str, dict[str, Any]],
         flow_logger=None,
         max_concurrency: Optional[int] = None,
-    ) -> Dataset:
+    ) -> pd.DataFrame:
         """Execute all blocks in sequence on the given dataset.

         Parameters
         ----------
-        dataset : Dataset
+        dataset : pd.DataFrame
             Dataset to process through all blocks.
         runtime_params : Dict[str, Dict[str, Any]]
             Runtime parameters for block execution.
@@ -574,7 +631,7 @@ class Flow(BaseModel):

         Returns
         -------
-        Dataset
+        pd.DataFrame
             Dataset after processing through all blocks.
         """
         # Use provided logger or fall back to global logger
@@ -598,7 +655,7 @@ class Flow(BaseModel):
             # Capture metrics before execution
             start_time = time.perf_counter()
             input_rows = len(current_dataset)
-            input_cols = set(current_dataset.column_names)
+            input_cols = set(current_dataset.columns)

             try:
                 # Execute block with validation and logging
@@ -613,7 +670,7 @@ class Flow(BaseModel):
                 # Capture metrics after successful execution
                 execution_time = time.perf_counter() - start_time
                 output_rows = len(current_dataset)
-                output_cols = set(current_dataset.column_names)
+                output_cols = set(current_dataset.columns)
                 added_cols = output_cols - input_cols
                 removed_cols = input_cols - output_cols

@@ -634,7 +691,7 @@ class Flow(BaseModel):
                 exec_logger.info(
                     f"Block '{block.block_name}' completed successfully: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.column_names)} columns"
+                    f"{len(current_dataset.columns)} columns"
                 )

             except Exception as exc:
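Many of the mechanical edits in this file swap the Dataset column API for its pandas equivalent. For the way they are used here, the substitutions are one-to-one, as a quick sanity check shows:

    import datasets
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    ds = datasets.Dataset.from_pandas(df)

    assert len(df) == len(ds) == 2                  # row counts behave identically
    assert df.columns.tolist() == ds.column_names   # columns.tolist() vs. column_names
    assert set(df.columns) == set(ds.column_names)  # set() accepts the Index directly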
@@ -932,17 +989,37 @@ class Flow(BaseModel):
             "experimental": self.metadata.recommended_models.experimental,
         }

-    def validate_dataset(self, dataset: Dataset) -> list[str]:
-        """Validate dataset against flow requirements."""
+    def validate_dataset(
+        self, dataset: Union[pd.DataFrame, datasets.Dataset]
+    ) -> list[str]:
+        """Validate dataset against flow requirements.
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Dataset to validate. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
+
+        Returns
+        -------
+        list[str]
+            List of validation error messages (empty if valid).
+        """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         errors = []

         if len(dataset) == 0:
             errors.append("Dataset is empty")

         if self.metadata.dataset_requirements:
+            # Get column names
+            columns = dataset.columns.tolist()
+
             errors.extend(
                 self.metadata.dataset_requirements.validate_dataset(
-                    dataset.column_names, len(dataset)
+                    columns, len(dataset)
                 )
             )

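validate_dataset() keeps the same checks but now reads them off the DataFrame. A standalone illustration of what the empty/column checks amount to (the required column names and the error wording below are invented; the real ones come from the flow's dataset_requirements):

    import pandas as pd

    required_columns = ["document", "domain"]       # illustrative requirement
    df = pd.DataFrame({"document": ["some text"]})

    errors = []
    if len(df) == 0:
        errors.append("Dataset is empty")
    errors.extend(
        f"Missing required column: {col}"
        for col in required_columns
        if col not in df.columns.tolist()
    )
    print(errors)  # ['Missing required column: domain']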
@@ -950,7 +1027,7 @@ class Flow(BaseModel):

     def dry_run(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         sample_size: int = 2,
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         max_concurrency: Optional[int] = None,
@@ -960,8 +1037,9 @@ class Flow(BaseModel):

         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to test with.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to test with. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         sample_size : int, default=2
             Number of samples to use for dry run testing.
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
@@ -986,6 +1064,9 @@ class Flow(BaseModel):
         FlowValidationError
             If any block fails during dry run execution.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         # Validate preconditions
         if not self.blocks:
             raise FlowValidationError("Cannot dry run empty flow")
@@ -1017,7 +1098,7 @@ class Flow(BaseModel):
         )

         # Create subset dataset
-        sample_dataset = dataset.select(range(actual_sample_size))
+        sample_dataset = dataset.iloc[:actual_sample_size]

         # Initialize dry run results
         dry_run_results = {
@@ -1026,7 +1107,7 @@ class Flow(BaseModel):
             "sample_size": actual_sample_size,
             "original_dataset_size": len(dataset),
             "max_concurrency": max_concurrency,
-            "input_columns": dataset.column_names,
+            "input_columns": dataset.columns.tolist(),
             "blocks_executed": [],
             "final_dataset": None,
             "execution_successful": True,
@@ -1070,7 +1151,7 @@ class Flow(BaseModel):
                     "execution_time_seconds": block_execution_time,
                     "input_rows": input_rows,
                     "output_rows": len(current_dataset),
-                    "output_columns": current_dataset.column_names,
+                    "output_columns": current_dataset.columns.tolist(),
                     "parameters_used": block_kwargs,
                 }

@@ -1079,14 +1160,14 @@ class Flow(BaseModel):
                 logger.info(
                     f"Dry run block '{block.block_name}' completed: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.column_names)} columns, "
+                    f"{len(current_dataset.columns)} columns, "
                     f"{block_execution_time:.2f}s"
                 )

         # Store final results
         dry_run_results["final_dataset"] = {
             "rows": len(current_dataset),
-            "columns": current_dataset.column_names,
+            "columns": current_dataset.columns.tolist(),
             "sample_data": current_dataset.to_dict()
             if len(current_dataset) > 0
             else {},
@@ -1121,7 +1202,7 @@ class Flow(BaseModel):
     def _estimate_total_time(
         self,
         first_run_results: dict[str, Any],
-        dataset: Dataset,
+        dataset: pd.DataFrame,
         runtime_params: Optional[dict[str, dict[str, Any]]],
         max_concurrency: Optional[int],
     ) -> dict[str, Any]:
@@ -1134,7 +1215,7 @@ class Flow(BaseModel):
         ----------
         first_run_results : dict
             Results from the first dry run.
-        dataset : Dataset
+        dataset : pd.DataFrame
             Full dataset for estimation.
         runtime_params : Optional[dict]
             Runtime parameters.
@@ -1283,13 +1364,13 @@ class Flow(BaseModel):
         """
         return self.metadata.dataset_requirements

-    def get_dataset_schema(self) -> Dataset:
+    def get_dataset_schema(self) -> pd.DataFrame:
         """Get an empty dataset with the correct schema for this flow.

         Returns
         -------
-        Dataset
-            Empty HuggingFace Dataset with the correct schema/features for this flow.
+        pd.DataFrame
+            Empty DataFrame with the correct schema/features for this flow.
             Users can add data to this dataset or use it to validate their own dataset schema.

         Examples
@@ -1305,50 +1386,51 @@ class Flow(BaseModel):
        ... })
        >>>
        >>> # Or validate your existing dataset schema
-        >>> my_dataset = Dataset.from_dict(my_data)
-        >>> if my_dataset.features == schema_dataset.features:
+        >>> my_dataset = pd.DataFrame(my_data)
+        >>> if my_dataset.dtypes.equals(schema_dataset.dtypes):
        ...     print("Schema matches!")
        """

        requirements = self.get_dataset_requirements()

        if requirements is None:
-            # Return empty dataset with no schema requirements
-            return Dataset.from_dict({})
+            # Return empty dataframe with no schema requirements
+            return pd.DataFrame({})

-        # Build schema features
-        schema_features = {}
+        # Build schema with column names and dtypes
+        schema = {}

         # Process required columns
         for col_name in requirements.required_columns:
             col_type = requirements.column_types.get(col_name, "string")
-            schema_features[col_name] = self._map_column_type_to_feature(col_type)
+            schema[col_name] = self._map_column_type_to_dtype(col_type)

         # Process optional columns
         for col_name in requirements.optional_columns:
             col_type = requirements.column_types.get(col_name, "string")
-            schema_features[col_name] = self._map_column_type_to_feature(col_type)
+            schema[col_name] = self._map_column_type_to_dtype(col_type)

-        # Create empty dataset with the correct features
-        features = datasets.Features(schema_features)
-        empty_data = {col_name: [] for col_name in schema_features.keys()}
+        # Create empty dataframe with the correct dtypes
+        empty_data = {
+            col_name: pd.Series([], dtype=dtype) for col_name, dtype in schema.items()
+        }

-        return Dataset.from_dict(empty_data, features=features)
+        return pd.DataFrame(empty_data)

-    def _map_column_type_to_feature(self, col_type: str):
-        """Map column type string to HuggingFace feature type."""
-        # Map common type names to HuggingFace types
+    def _map_column_type_to_dtype(self, col_type: str):
+        """Map column type string to pandas dtype."""
+        # Map common type names to pandas dtypes
         if col_type in ["str", "string", "text"]:
-            return datasets.Value("string")
+            return "object"  # pandas uses 'object' for strings
         elif col_type in ["int", "integer"]:
-            return datasets.Value("int64")
+            return "Int64"  # nullable integer
         elif col_type in ["float", "number"]:
-            return datasets.Value("float64")
+            return "float64"
         elif col_type in ["bool", "boolean"]:
-            return datasets.Value("bool")
+            return "boolean"  # nullable boolean
         else:
-            # Default to string for unknown types
-            return datasets.Value("string")
+            # Default to object (string) for unknown types
+            return "object"

     def print_info(self) -> None:
         """
@@ -9,7 +9,7 @@ import os
 import uuid

 # Third Party
-from datasets import Dataset
+import pandas as pd

 # Local
 from ..utils.datautils import safe_concatenate_with_validation
@@ -67,18 +67,18 @@ class FlowCheckpointer:
         return os.path.join(self.checkpoint_dir, "flow_metadata.json")

     def load_existing_progress(
-        self, input_dataset: Dataset
-    ) -> Tuple[Dataset, Optional[Dataset]]:
+        self, input_dataset: pd.DataFrame
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """Load existing checkpoint data and determine remaining work.

         Parameters
         ----------
-        input_dataset : Dataset
+        input_dataset : pd.DataFrame
             Original input dataset for the flow.

         Returns
         -------
-        Tuple[Dataset, Optional[Dataset]]
+        Tuple[pd.DataFrame, Optional[pd.DataFrame]]
             (remaining_samples_to_process, completed_samples_dataset)
             If no checkpoints exist, returns (input_dataset, None)
         """
@@ -127,20 +127,20 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load checkpoints: {exc}. Starting from scratch.")
             return input_dataset, None

-    def add_completed_samples(self, samples: Dataset) -> None:
+    def add_completed_samples(self, samples: pd.DataFrame) -> None:
         """Add samples that have completed the entire flow.

         Parameters
         ----------
-        samples : Dataset
+        samples : pd.DataFrame
             Samples that have completed processing through all blocks.
         """
         if not self.is_enabled:
             return

         # Add to pending samples
-        for sample in samples:
-            self._pending_samples.append(sample)
+        for _, sample in samples.iterrows():
+            self._pending_samples.append(sample.to_dict())
             self._samples_processed += 1

         # Check if we should save a checkpoint
@@ -167,9 +167,9 @@ class FlowCheckpointer:
             self.checkpoint_dir, f"checkpoint_{self._checkpoint_counter:04d}.jsonl"
         )

-        # Convert pending samples to dataset and save
-        checkpoint_dataset = Dataset.from_list(self._pending_samples)
-        checkpoint_dataset.to_json(checkpoint_file, orient="records", lines=True)
+        # Convert pending samples to dataframe and save
+        checkpoint_df = pd.DataFrame(self._pending_samples)
+        checkpoint_df.to_json(checkpoint_file, orient="records", lines=True)

         # Update metadata
         self._save_metadata()
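Checkpoints remain JSON Lines files on disk; only the writer and reader change. A round trip with the same orient/lines settings, written to the working directory purely for demonstration:

    import pandas as pd

    pending = [{"document": "a", "score": 1}, {"document": "b", "score": 2}]

    # Write the way the checkpointer now does ...
    pd.DataFrame(pending).to_json("checkpoint_0001.jsonl", orient="records", lines=True)

    # ... and read it back the way _load_completed_samples now does.
    df = pd.read_json("checkpoint_0001.jsonl", lines=True)
    assert df.to_dict(orient="records") == pending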
@@ -207,7 +207,7 @@ class FlowCheckpointer:
             logger.warning(f"Failed to load metadata: {exc}")
             return None

-    def _load_completed_samples(self) -> Optional[Dataset]:
+    def _load_completed_samples(self) -> Optional[pd.DataFrame]:
         """Load all completed samples from checkpoint files."""
         checkpoint_files = []
         checkpoint_dir = Path(self.checkpoint_dir)
@@ -222,27 +222,25 @@ class FlowCheckpointer:
         # Sort checkpoint files by number
         checkpoint_files.sort()

-        # Load and concatenate all checkpoint datasets
-        datasets = []
+        # Load and concatenate all checkpoint dataframes
+        dataframes = []
         for file_path in checkpoint_files:
             try:
-                dataset = Dataset.from_json(file_path)
-                if len(dataset) > 0:
-                    datasets.append(dataset)
-                    logger.debug(
-                        f"Loaded checkpoint: {file_path} ({len(dataset)} samples)"
-                    )
+                df = pd.read_json(file_path, lines=True)
+                if len(df) > 0:
+                    dataframes.append(df)
+                    logger.debug(f"Loaded checkpoint: {file_path} ({len(df)} samples)")
             except Exception as exc:
                 logger.warning(f"Failed to load checkpoint {file_path}: {exc}")

-        if not datasets:
+        if not dataframes:
             return None

-        return safe_concatenate_with_validation(datasets, "checkpoint files")
+        return safe_concatenate_with_validation(dataframes, "checkpoint files")

     def _find_remaining_samples(
-        self, input_dataset: Dataset, completed_dataset: Dataset
-    ) -> Dataset:
+        self, input_dataset: pd.DataFrame, completed_dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Find samples from input_dataset that are not in completed_dataset.

         Note: Assumes input_dataset contains unique samples. For datasets with
@@ -250,19 +248,19 @@ class FlowCheckpointer:

         Parameters
         ----------
-        input_dataset : Dataset
+        input_dataset : pd.DataFrame
             Original input dataset (assumed to contain unique samples).
-        completed_dataset : Dataset
+        completed_dataset : pd.DataFrame
             Dataset of completed samples.

         Returns
         -------
-        Dataset
+        pd.DataFrame
             Samples that still need processing.
         """
         # Get common columns for comparison
-        input_columns = set(input_dataset.column_names)
-        completed_columns = set(completed_dataset.column_names)
+        input_columns = set(input_dataset.columns.tolist())
+        completed_columns = set(completed_dataset.columns.tolist())
         common_columns = list(input_columns & completed_columns)

         if not common_columns:
@@ -272,9 +270,9 @@ class FlowCheckpointer:
             )
             return input_dataset

-        # Convert to pandas for easier comparison
-        input_df = input_dataset.select_columns(common_columns).to_pandas()
-        completed_df = completed_dataset.select_columns(common_columns).to_pandas()
+        # Select only common columns for comparison
+        input_df = input_dataset[common_columns]
+        completed_df = completed_dataset[common_columns]

         # Find rows that haven't been completed
         # Use tuple representation for comparison
@@ -287,10 +285,10 @@ class FlowCheckpointer:
         remaining_indices = input_df[remaining_mask].index.tolist()

         if not remaining_indices:
-            # Return empty dataset with same structure
-            return input_dataset.select([])
+            # Return empty dataframe with same structure
+            return input_dataset.iloc[0:0]

-        return input_dataset.select(remaining_indices)
+        return input_dataset.iloc[remaining_indices]

     def get_progress_info(self) -> Dict[str, Any]:
         """Get information about current progress.
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any

 # Third Party
-from datasets import Dataset
+import pandas as pd

 if TYPE_CHECKING:
     # Local
@@ -180,14 +180,14 @@ class FlowValidator:

         return errors

-    def validate_flow_execution(self, flow: "Flow", dataset: Dataset) -> list[str]:
+    def validate_flow_execution(self, flow: "Flow", dataset: pd.DataFrame) -> list[str]:
         """Validate that a flow can be executed with the given dataset.

         Parameters
         ----------
         flow : Flow
             The flow to validate.
-        dataset : Dataset
+        dataset : pd.DataFrame
             Dataset to validate against.

         Returns
@@ -206,7 +206,7 @@ class FlowValidator:
             return errors

         # Track available columns as we progress through blocks
-        current_columns = set(dataset.column_names)
+        current_columns = set(dataset.columns.tolist())

         for _i, block in enumerate(flow.blocks):
             block_name = block.block_name