sdg-hub 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -7
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -9
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -57
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/METADATA +2 -8
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/RECORD +25 -26
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.0.dist-info → sdg_hub-0.6.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/base.py
CHANGED
@@ -8,8 +8,6 @@ from typing import Any, Optional, Union
 import time
 import uuid
 
-# Third Party
-from datasets import Dataset
 from pydantic import (
     BaseModel,
     ConfigDict,

@@ -23,6 +21,9 @@ from rich.panel import Panel
 from rich.table import Table
 from rich.tree import Tree
 import datasets
+
+# Third Party
+import pandas as pd
 import yaml
 
 # Local
@@ -285,15 +286,62 @@ class Flow(BaseModel):
             return {key: str(yaml_dir / path) for key, path in paths.items()}
         return paths
 
+    @staticmethod
+    def _convert_to_dataframe(
+        dataset: Union[pd.DataFrame, datasets.Dataset],
+    ) -> tuple[pd.DataFrame, bool]:
+        """Convert datasets.Dataset to pd.DataFrame if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset in either format.
+
+        Returns
+        -------
+        tuple[pd.DataFrame, bool]
+            Tuple of (converted DataFrame, was_dataset flag).
+            was_dataset is True if input was a datasets.Dataset, False if it was already a DataFrame.
+        """
+        if isinstance(dataset, datasets.Dataset):
+            logger.info("Converting datasets.Dataset to pd.DataFrame for processing")
+            return dataset.to_pandas(), True
+        return dataset, False
+
+    @staticmethod
+    def _convert_from_dataframe(
+        df: pd.DataFrame, should_convert: bool
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
+        """Convert pd.DataFrame back to datasets.Dataset if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame to potentially convert.
+        should_convert : bool
+            If True, convert to datasets.Dataset. If False, return as-is.
+
+        Returns
+        -------
+        Union[pd.DataFrame, datasets.Dataset]
+            Original DataFrame or converted Dataset, matching the input type.
+        """
+        if should_convert:
+            logger.info(
+                "Converting pd.DataFrame back to datasets.Dataset to match input type"
+            )
+            return datasets.Dataset.from_pandas(df)
+        return df
+
     def generate(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         checkpoint_dir: Optional[str] = None,
         save_freq: Optional[int] = None,
         log_dir: Optional[str] = None,
         max_concurrency: Optional[int] = None,
-    ) -> Dataset:
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
         """Execute the flow blocks in sequence to generate data.
 
         Note: For flows with LLM blocks, set_model_config() must be called first
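The two static helpers above carry the whole backwards-compatibility contract: normalize a `datasets.Dataset` to a pandas DataFrame on the way in, remember that it happened, and convert back on the way out. Below is a minimal standalone sketch of that round trip; the module-level functions and sample data are made up for illustration, and it assumes pandas and the `datasets` package are installed.

```python
# Minimal sketch of the round-trip behavior introduced by the new helpers.
import datasets
import pandas as pd


def convert_to_dataframe(dataset):
    """Mirror of _convert_to_dataframe: normalize to a DataFrame, remember the input type."""
    if isinstance(dataset, datasets.Dataset):
        return dataset.to_pandas(), True
    return dataset, False


def convert_from_dataframe(df, was_dataset):
    """Mirror of _convert_from_dataframe: restore the caller's original type."""
    if was_dataset:
        return datasets.Dataset.from_pandas(df)
    return df


# Dataset in -> Dataset out
hf_input = datasets.Dataset.from_dict({"question": ["What is synthetic data generation?"]})
df, was_dataset = convert_to_dataframe(hf_input)   # processing happens on the DataFrame
assert isinstance(convert_from_dataframe(df, was_dataset), datasets.Dataset)

# DataFrame in -> DataFrame out
pdf_input = pd.DataFrame({"question": ["What is synthetic data generation?"]})
df, was_dataset = convert_to_dataframe(pdf_input)
assert isinstance(convert_from_dataframe(df, was_dataset), pd.DataFrame)
```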
@@ -301,8 +349,9 @@
 
         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to process.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to process. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
             Runtime parameters organized by block name. Format:
             {

@@ -324,8 +373,9 @@
 
         Returns
         -------
-        Dataset
+        Union[pd.DataFrame, datasets.Dataset]
             Processed dataset after all blocks have been executed.
+            Return type matches the input type (DataFrame in -> DataFrame out, Dataset in -> Dataset out).
 
         Raises
         ------

@@ -334,6 +384,9 @@
         FlowValidationError
             If flow validation fails or if model configuration is required but not set.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, was_dataset = self._convert_to_dataframe(dataset)
+
         # Validate save_freq parameter early to prevent range() errors
         if save_freq is not None and save_freq <= 0:
             raise FlowValidationError(
@@ -429,7 +482,7 @@
                     finally:
                         flow_logger.removeHandler(h)
 
-                return completed_dataset
+                return self._convert_from_dataframe(completed_dataset, was_dataset)
 
             dataset = remaining_dataset
             flow_logger.info(f"Resuming with {len(dataset)} remaining samples")

@@ -456,7 +509,7 @@
             # Process in chunks of save_freq
             for i in range(0, len(dataset), save_freq):
                 chunk_end = min(i + save_freq, len(dataset))
-                chunk_dataset = dataset.
+                chunk_dataset = dataset.iloc[i:chunk_end]
 
                 flow_logger.info(
                     f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"

@@ -480,7 +533,11 @@
             )
 
             # Combine with previously completed samples if any
-            if
+            if (
+                checkpointer
+                and completed_dataset is not None
+                and not completed_dataset.empty
+            ):
                 final_dataset = safe_concatenate_with_validation(
                     [completed_dataset, final_dataset],
                     "completed checkpoint data with newly processed data",

@@ -498,7 +555,7 @@
                 checkpointer.save_final_checkpoint()
 
             # Combine with previously completed samples if any
-            if completed_dataset:
+            if completed_dataset is not None and not completed_dataset.empty:
                 final_dataset = safe_concatenate_with_validation(
                     [completed_dataset, final_dataset],
                     "completed checkpoint data with newly processed data",

@@ -536,7 +593,7 @@
         flow_logger.info(
             f"Flow '{self.metadata.name}' completed successfully: "
             f"{len(final_dataset)} final samples, "
-            f"{len(final_dataset.
+            f"{len(final_dataset.columns)} final columns"
         )
 
         # Close file handlers if we opened a flow-specific logger

@@ -550,20 +607,20 @@
                 finally:
                     flow_logger.removeHandler(h)
 
-        return final_dataset
+        return self._convert_from_dataframe(final_dataset, was_dataset)
 
     def _execute_blocks_on_dataset(
         self,
-        dataset:
+        dataset: pd.DataFrame,
         runtime_params: dict[str, dict[str, Any]],
         flow_logger=None,
         max_concurrency: Optional[int] = None,
-    ) ->
+    ) -> pd.DataFrame:
         """Execute all blocks in sequence on the given dataset.
 
         Parameters
         ----------
-        dataset :
+        dataset : pd.DataFrame
             Dataset to process through all blocks.
         runtime_params : Dict[str, Dict[str, Any]]
             Runtime parameters for block execution.

@@ -574,7 +631,7 @@
 
         Returns
         -------
-
+        pd.DataFrame
             Dataset after processing through all blocks.
         """
         # Use provided logger or fall back to global logger

@@ -598,7 +655,7 @@
             # Capture metrics before execution
             start_time = time.perf_counter()
             input_rows = len(current_dataset)
-            input_cols = set(current_dataset.
+            input_cols = set(current_dataset.columns)
 
             try:
                 # Execute block with validation and logging

@@ -613,7 +670,7 @@
                 # Capture metrics after successful execution
                 execution_time = time.perf_counter() - start_time
                 output_rows = len(current_dataset)
-                output_cols = set(current_dataset.
+                output_cols = set(current_dataset.columns)
                 added_cols = output_cols - input_cols
                 removed_cols = input_cols - output_cols
 

@@ -634,7 +691,7 @@
                 exec_logger.info(
                     f"Block '{block.block_name}' completed successfully: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.
+                    f"{len(current_dataset.columns)} columns"
                 )
 
             except Exception as exc:
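The checkpointed path now slices the DataFrame positionally with `iloc` instead of the Dataset API. A small sketch of the chunking loop in isolation, with made-up data and a hypothetical `save_freq`; only the pandas calls visible in the hunks above are used.

```python
# Sketch of the save_freq chunking loop on a plain DataFrame (illustrative data).
import pandas as pd

dataset = pd.DataFrame({"question": [f"q{i}" for i in range(10)]})
save_freq = 4  # must be a positive int; generate() rejects save_freq <= 0 early

for i in range(0, len(dataset), save_freq):
    chunk_end = min(i + save_freq, len(dataset))
    chunk_dataset = dataset.iloc[i:chunk_end]  # positional slice of the pending rows
    print(f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}, "
          f"{len(chunk_dataset)} rows")
```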
@@ -932,17 +989,37 @@
             "experimental": self.metadata.recommended_models.experimental,
         }
 
-    def validate_dataset(
-
+    def validate_dataset(
+        self, dataset: Union[pd.DataFrame, datasets.Dataset]
+    ) -> list[str]:
+        """Validate dataset against flow requirements.
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Dataset to validate. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
+
+        Returns
+        -------
+        list[str]
+            List of validation error messages (empty if valid).
+        """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         errors = []
 
         if len(dataset) == 0:
             errors.append("Dataset is empty")
 
         if self.metadata.dataset_requirements:
+            # Get column names
+            columns = dataset.columns.tolist()
+
             errors.extend(
                 self.metadata.dataset_requirements.validate_dataset(
-
+                    columns, len(dataset)
                 )
             )
 
@@ -950,7 +1027,7 @@
 
     def dry_run(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         sample_size: int = 2,
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         max_concurrency: Optional[int] = None,

@@ -960,8 +1037,9 @@
 
         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to test with.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to test with. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         sample_size : int, default=2
             Number of samples to use for dry run testing.
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional

@@ -986,6 +1064,9 @@
         FlowValidationError
             If any block fails during dry run execution.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         # Validate preconditions
         if not self.blocks:
             raise FlowValidationError("Cannot dry run empty flow")

@@ -1017,7 +1098,7 @@
         )
 
         # Create subset dataset
-        sample_dataset = dataset.
+        sample_dataset = dataset.iloc[:actual_sample_size]
 
         # Initialize dry run results
         dry_run_results = {

@@ -1026,7 +1107,7 @@
             "sample_size": actual_sample_size,
             "original_dataset_size": len(dataset),
             "max_concurrency": max_concurrency,
-            "input_columns": dataset.
+            "input_columns": dataset.columns.tolist(),
             "blocks_executed": [],
             "final_dataset": None,
             "execution_successful": True,

@@ -1070,7 +1151,7 @@
                     "execution_time_seconds": block_execution_time,
                     "input_rows": input_rows,
                     "output_rows": len(current_dataset),
-                    "output_columns": current_dataset.
+                    "output_columns": current_dataset.columns.tolist(),
                     "parameters_used": block_kwargs,
                 }
 

@@ -1079,14 +1160,14 @@
                 logger.info(
                     f"Dry run block '{block.block_name}' completed: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.
+                    f"{len(current_dataset.columns)} columns, "
                     f"{block_execution_time:.2f}s"
                 )
 
         # Store final results
         dry_run_results["final_dataset"] = {
             "rows": len(current_dataset),
-            "columns": current_dataset.
+            "columns": current_dataset.columns.tolist(),
             "sample_data": current_dataset.to_dict()
             if len(current_dataset) > 0
             else {},

@@ -1121,7 +1202,7 @@
     def _estimate_total_time(
         self,
         first_run_results: dict[str, Any],
-        dataset:
+        dataset: pd.DataFrame,
         runtime_params: Optional[dict[str, dict[str, Any]]],
         max_concurrency: Optional[int],
     ) -> dict[str, Any]:

@@ -1134,7 +1215,7 @@
         ----------
         first_run_results : dict
             Results from the first dry run.
-        dataset :
+        dataset : pd.DataFrame
             Full dataset for estimation.
         runtime_params : Optional[dict]
             Runtime parameters.
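The dry-run path applies the same pandas idioms: positional sampling with `iloc` and column listing with `columns.tolist()`. A small sketch of the sampling and result bookkeeping shown in the hunks above, using made-up data; the dictionary keys mirror those in the diff.

```python
# Sketch of dry-run sampling and bookkeeping on a DataFrame (illustrative data).
import pandas as pd

dataset = pd.DataFrame({"question": ["q1", "q2", "q3"]})
sample_size = 2
actual_sample_size = min(sample_size, len(dataset))
sample_dataset = dataset.iloc[:actual_sample_size]  # first N rows, no shuffling implied

dry_run_results = {
    "sample_size": actual_sample_size,
    "original_dataset_size": len(dataset),
    "input_columns": dataset.columns.tolist(),
    "final_dataset": {
        "rows": len(sample_dataset),
        "columns": sample_dataset.columns.tolist(),
        "sample_data": sample_dataset.to_dict() if len(sample_dataset) > 0 else {},
    },
}
print(dry_run_results["input_columns"])
```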
@@ -1283,13 +1364,13 @@
         """
         return self.metadata.dataset_requirements
 
-    def get_dataset_schema(self) ->
+    def get_dataset_schema(self) -> pd.DataFrame:
         """Get an empty dataset with the correct schema for this flow.
 
         Returns
         -------
-
-        Empty
+        pd.DataFrame
+            Empty DataFrame with the correct schema/features for this flow.
             Users can add data to this dataset or use it to validate their own dataset schema.
 
         Examples

@@ -1305,50 +1386,51 @@
         ... })
         >>>
         >>> # Or validate your existing dataset schema
-        >>> my_dataset =
-        >>> if my_dataset.
+        >>> my_dataset = pd.DataFrame(my_data)
+        >>> if my_dataset.dtypes.equals(schema_dataset.dtypes):
         ...     print("Schema matches!")
         """
 
         requirements = self.get_dataset_requirements()
 
         if requirements is None:
-            # Return empty
-            return
+            # Return empty dataframe with no schema requirements
+            return pd.DataFrame({})
 
-        # Build schema
-
+        # Build schema with column names and dtypes
+        schema = {}
 
         # Process required columns
         for col_name in requirements.required_columns:
             col_type = requirements.column_types.get(col_name, "string")
-
+            schema[col_name] = self._map_column_type_to_dtype(col_type)
 
         # Process optional columns
         for col_name in requirements.optional_columns:
             col_type = requirements.column_types.get(col_name, "string")
-
+            schema[col_name] = self._map_column_type_to_dtype(col_type)
 
-        # Create empty
-
-
+        # Create empty dataframe with the correct dtypes
+        empty_data = {
+            col_name: pd.Series([], dtype=dtype) for col_name, dtype in schema.items()
+        }
 
-        return
+        return pd.DataFrame(empty_data)
 
-    def
-        """Map column type string to
-        # Map common type names to
+    def _map_column_type_to_dtype(self, col_type: str):
+        """Map column type string to pandas dtype."""
+        # Map common type names to pandas dtypes
         if col_type in ["str", "string", "text"]:
-            return
+            return "object"  # pandas uses 'object' for strings
         elif col_type in ["int", "integer"]:
-            return
+            return "Int64"  # nullable integer
         elif col_type in ["float", "number"]:
-            return
+            return "float64"
        elif col_type in ["bool", "boolean"]:
-            return
+            return "boolean"  # nullable boolean
         else:
-            # Default to string for unknown types
-            return
+            # Default to object (string) for unknown types
+            return "object"
 
     def print_info(self) -> None:
         """
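Taken together, `get_dataset_schema()` and `_map_column_type_to_dtype()` now build an empty DataFrame whose dtypes encode the flow's column requirements. A sketch of what that produces and how the docstring's schema check works; the column names and types below are hypothetical, not taken from a shipped flow.

```python
# Sketch of the schema DataFrame built from the dtype mapping above (hypothetical columns).
import pandas as pd

schema = {
    "document": "object",    # "str" / "string" / "text"
    "num_samples": "Int64",  # nullable integer
    "score": "float64",
    "is_valid": "boolean",   # nullable boolean
}
empty_data = {col: pd.Series([], dtype=dtype) for col, dtype in schema.items()}
schema_df = pd.DataFrame(empty_data)
print(schema_df.dtypes)

# Validate your own data against it, as the docstring example suggests.
my_df = pd.DataFrame({
    "document": pd.Series(["hello"], dtype="object"),
    "num_samples": pd.Series([3], dtype="Int64"),
    "score": pd.Series([0.9], dtype="float64"),
    "is_valid": pd.Series([True], dtype="boolean"),
})
if my_df.dtypes.equals(schema_df.dtypes):
    print("Schema matches!")
```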
sdg_hub/core/flow/checkpointer.py
CHANGED

@@ -9,7 +9,7 @@ import os
 import uuid
 
 # Third Party
-
+import pandas as pd
 
 # Local
 from ..utils.datautils import safe_concatenate_with_validation

@@ -67,18 +67,18 @@ class FlowCheckpointer:
         return os.path.join(self.checkpoint_dir, "flow_metadata.json")
 
     def load_existing_progress(
-        self, input_dataset:
-    ) -> Tuple[
+        self, input_dataset: pd.DataFrame
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """Load existing checkpoint data and determine remaining work.
 
         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
             Original input dataset for the flow.
 
         Returns
         -------
-        Tuple[
+        Tuple[pd.DataFrame, Optional[pd.DataFrame]]
             (remaining_samples_to_process, completed_samples_dataset)
             If no checkpoints exist, returns (input_dataset, None)
         """

@@ -127,20 +127,20 @@
             logger.warning(f"Failed to load checkpoints: {exc}. Starting from scratch.")
             return input_dataset, None
 
-    def add_completed_samples(self, samples:
+    def add_completed_samples(self, samples: pd.DataFrame) -> None:
         """Add samples that have completed the entire flow.
 
         Parameters
         ----------
-        samples :
+        samples : pd.DataFrame
             Samples that have completed processing through all blocks.
         """
         if not self.is_enabled:
             return
 
         # Add to pending samples
-        for sample in samples:
-            self._pending_samples.append(sample)
+        for _, sample in samples.iterrows():
+            self._pending_samples.append(sample.to_dict())
             self._samples_processed += 1
 
         # Check if we should save a checkpoint

@@ -167,9 +167,9 @@
             self.checkpoint_dir, f"checkpoint_{self._checkpoint_counter:04d}.jsonl"
         )
 
-        # Convert pending samples to
-
-
+        # Convert pending samples to dataframe and save
+        checkpoint_df = pd.DataFrame(self._pending_samples)
+        checkpoint_df.to_json(checkpoint_file, orient="records", lines=True)
 
         # Update metadata
         self._save_metadata()

@@ -207,7 +207,7 @@
             logger.warning(f"Failed to load metadata: {exc}")
             return None
 
-    def _load_completed_samples(self) -> Optional[
+    def _load_completed_samples(self) -> Optional[pd.DataFrame]:
         """Load all completed samples from checkpoint files."""
         checkpoint_files = []
         checkpoint_dir = Path(self.checkpoint_dir)

@@ -222,27 +222,25 @@
         # Sort checkpoint files by number
         checkpoint_files.sort()
 
-        # Load and concatenate all checkpoint
-
+        # Load and concatenate all checkpoint dataframes
+        dataframes = []
         for file_path in checkpoint_files:
             try:
-
-                if len(
-
-                logger.debug(
-                    f"Loaded checkpoint: {file_path} ({len(dataset)} samples)"
-                )
+                df = pd.read_json(file_path, lines=True)
+                if len(df) > 0:
+                    dataframes.append(df)
+                    logger.debug(f"Loaded checkpoint: {file_path} ({len(df)} samples)")
             except Exception as exc:
                 logger.warning(f"Failed to load checkpoint {file_path}: {exc}")
 
-        if not
+        if not dataframes:
             return None
 
-        return safe_concatenate_with_validation(
+        return safe_concatenate_with_validation(dataframes, "checkpoint files")
 
     def _find_remaining_samples(
-        self, input_dataset:
-    ) ->
+        self, input_dataset: pd.DataFrame, completed_dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Find samples from input_dataset that are not in completed_dataset.
 
         Note: Assumes input_dataset contains unique samples. For datasets with

@@ -250,19 +248,19 @@
 
         Parameters
         ----------
-        input_dataset :
+        input_dataset : pd.DataFrame
             Original input dataset (assumed to contain unique samples).
-        completed_dataset :
+        completed_dataset : pd.DataFrame
             Dataset of completed samples.
 
         Returns
         -------
-
+        pd.DataFrame
             Samples that still need processing.
         """
         # Get common columns for comparison
-        input_columns = set(input_dataset.
-        completed_columns = set(completed_dataset.
+        input_columns = set(input_dataset.columns.tolist())
+        completed_columns = set(completed_dataset.columns.tolist())
         common_columns = list(input_columns & completed_columns)
 
         if not common_columns:

@@ -272,9 +270,9 @@
             )
             return input_dataset
 
-        #
-        input_df = input_dataset
-        completed_df = completed_dataset
+        # Select only common columns for comparison
+        input_df = input_dataset[common_columns]
+        completed_df = completed_dataset[common_columns]
 
         # Find rows that haven't been completed
         # Use tuple representation for comparison

@@ -287,10 +285,10 @@
         remaining_indices = input_df[remaining_mask].index.tolist()
 
         if not remaining_indices:
-            # Return empty
-            return input_dataset.
+            # Return empty dataframe with same structure
+            return input_dataset.iloc[0:0]
 
-        return input_dataset.
+        return input_dataset.iloc[remaining_indices]
 
     def get_progress_info(self) -> Dict[str, Any]:
         """Get information about current progress.
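The checkpointer keeps the JSONL file format but now writes and reads it through pandas. Below is a sketch of that round trip against a temporary directory; the file name pattern follows the diff, while the sample rows are hypothetical.

```python
# Sketch of the JSONL checkpoint round trip now done with pandas.
import os
import tempfile

import pandas as pd

pending_samples = [
    {"question": "q1", "answer": "a1"},
    {"question": "q2", "answer": "a2"},
]

with tempfile.TemporaryDirectory() as tmp:
    checkpoint_file = os.path.join(tmp, "checkpoint_0001.jsonl")

    # Save: rows accumulated as dicts become one JSONL file per checkpoint.
    pd.DataFrame(pending_samples).to_json(checkpoint_file, orient="records", lines=True)

    # Load: each checkpoint file is read back into a DataFrame and concatenated later.
    df = pd.read_json(checkpoint_file, lines=True)
    print(len(df), "samples restored")
```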
sdg_hub/core/flow/validation.py
CHANGED
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any
 
 # Third Party
-
+import pandas as pd
 
 if TYPE_CHECKING:
     # Local

@@ -180,14 +180,14 @@ class FlowValidator:
 
         return errors
 
-    def validate_flow_execution(self, flow: "Flow", dataset:
+    def validate_flow_execution(self, flow: "Flow", dataset: pd.DataFrame) -> list[str]:
         """Validate that a flow can be executed with the given dataset.
 
         Parameters
         ----------
         flow : Flow
             The flow to validate.
-        dataset :
+        dataset : pd.DataFrame
             Dataset to validate against.
 
         Returns

@@ -206,7 +206,7 @@
             return errors
 
         # Track available columns as we progress through blocks
-        current_columns = set(dataset.
+        current_columns = set(dataset.columns.tolist())
 
         for _i, block in enumerate(flow.blocks):
             block_name = block.block_name