sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/base.py +60 -58
- sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
- sdg_hub/core/blocks/llm/__init__.py +0 -2
- sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
- sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
- sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
- sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
- sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
- sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
- sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
- sdg_hub/core/blocks/transform/melt_columns.py +13 -12
- sdg_hub/core/blocks/transform/rename_columns.py +20 -9
- sdg_hub/core/blocks/transform/text_concat.py +20 -21
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
- sdg_hub/core/flow/base.py +139 -106
- sdg_hub/core/flow/checkpointer.py +34 -36
- sdg_hub/core/flow/validation.py +4 -4
- sdg_hub/core/utils/datautils.py +52 -54
- sdg_hub/core/utils/flow_metrics.py +9 -6
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
- sdg_hub/core/utils/temp_manager.py +0 -57
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/base.py
CHANGED
@@ -5,13 +5,9 @@
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Optional, Union
-from weakref import finalize
-import gc
 import time
 import uuid
 
-# Third Party
-from datasets import Dataset
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -25,6 +21,9 @@ from rich.panel import Panel
 from rich.table import Table
 from rich.tree import Tree
 import datasets
+
+# Third Party
+import pandas as pd
 import yaml
 
 # Local
@@ -39,11 +38,6 @@ from ..utils.flow_metrics import (
 )
 from ..utils.logger_config import setup_logger
 from ..utils.path_resolution import resolve_path
-from ..utils.temp_manager import (
-    cleanup_path,
-    create_temp_dir,
-    create_temp_file,
-)
 from ..utils.time_estimator import estimate_execution_time
 from ..utils.yaml_utils import save_flow_yaml
 from .checkpointer import FlowCheckpointer
@@ -292,15 +286,62 @@ class Flow(BaseModel):
             return {key: str(yaml_dir / path) for key, path in paths.items()}
         return paths
 
+    @staticmethod
+    def _convert_to_dataframe(
+        dataset: Union[pd.DataFrame, datasets.Dataset],
+    ) -> tuple[pd.DataFrame, bool]:
+        """Convert datasets.Dataset to pd.DataFrame if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset in either format.
+
+        Returns
+        -------
+        tuple[pd.DataFrame, bool]
+            Tuple of (converted DataFrame, was_dataset flag).
+            was_dataset is True if input was a datasets.Dataset, False if it was already a DataFrame.
+        """
+        if isinstance(dataset, datasets.Dataset):
+            logger.info("Converting datasets.Dataset to pd.DataFrame for processing")
+            return dataset.to_pandas(), True
+        return dataset, False
+
+    @staticmethod
+    def _convert_from_dataframe(
+        df: pd.DataFrame, should_convert: bool
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
+        """Convert pd.DataFrame back to datasets.Dataset if needed (backwards compatibility).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame to potentially convert.
+        should_convert : bool
+            If True, convert to datasets.Dataset. If False, return as-is.
+
+        Returns
+        -------
+        Union[pd.DataFrame, datasets.Dataset]
+            Original DataFrame or converted Dataset, matching the input type.
+        """
+        if should_convert:
+            logger.info(
+                "Converting pd.DataFrame back to datasets.Dataset to match input type"
+            )
+            return datasets.Dataset.from_pandas(df)
+        return df
+
     def generate(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         checkpoint_dir: Optional[str] = None,
         save_freq: Optional[int] = None,
         log_dir: Optional[str] = None,
         max_concurrency: Optional[int] = None,
-    ) -> Dataset:
+    ) -> Union[pd.DataFrame, datasets.Dataset]:
         """Execute the flow blocks in sequence to generate data.
 
         Note: For flows with LLM blocks, set_model_config() must be called first
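Note: the two new helpers above are what make the pandas/HuggingFace interop transparent to callers. A minimal standalone sketch of the same round trip (the column name is illustrative, not taken from the flow):

import pandas as pd
import datasets

df = pd.DataFrame({"document": ["doc a", "doc b"]})

# What _convert_to_dataframe does for a HuggingFace input
ds = datasets.Dataset.from_pandas(df)
as_df, was_dataset = ds.to_pandas(), True

# What _convert_from_dataframe does on the way out
result = datasets.Dataset.from_pandas(as_df) if was_dataset else as_df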
@@ -308,8 +349,9 @@ class Flow(BaseModel):
 
         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to process.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to process. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
             Runtime parameters organized by block name. Format:
             {
@@ -331,8 +373,9 @@ class Flow(BaseModel):
 
         Returns
         -------
-        Dataset
+        Union[pd.DataFrame, datasets.Dataset]
             Processed dataset after all blocks have been executed.
+            Return type matches the input type (DataFrame in -> DataFrame out, Dataset in -> Dataset out).
 
         Raises
         ------
@@ -341,6 +384,9 @@ class Flow(BaseModel):
         FlowValidationError
             If flow validation fails or if model configuration is required but not set.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, was_dataset = self._convert_to_dataframe(dataset)
+
         # Validate save_freq parameter early to prevent range() errors
         if save_freq is not None and save_freq <= 0:
            raise FlowValidationError(
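Note: taken together, the docstring change and the conversion call above make generate() type-preserving. A hedged usage sketch, assuming `flow` is an already-loaded Flow with its model config set (flow construction is omitted, and the column name is illustrative):

import pandas as pd
import datasets

df_in = pd.DataFrame({"document": ["some source text"]})

df_out = flow.generate(df_in)                                # DataFrame in -> DataFrame out
ds_out = flow.generate(datasets.Dataset.from_pandas(df_in))  # Dataset in -> Dataset out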
@@ -436,7 +482,7 @@ class Flow(BaseModel):
             finally:
                 flow_logger.removeHandler(h)
 
-            return completed_dataset
+            return self._convert_from_dataframe(completed_dataset, was_dataset)
 
         dataset = remaining_dataset
         flow_logger.info(f"Resuming with {len(dataset)} remaining samples")
@@ -463,7 +509,7 @@ class Flow(BaseModel):
         # Process in chunks of save_freq
         for i in range(0, len(dataset), save_freq):
             chunk_end = min(i + save_freq, len(dataset))
-            chunk_dataset = dataset.
+            chunk_dataset = dataset.iloc[i:chunk_end]
 
             flow_logger.info(
                 f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
            )
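Note: checkpoint chunking now slices the DataFrame positionally (the removed right-hand side is truncated in this diff, so only the new form is shown). A standalone sketch of the same pattern:

import pandas as pd

dataset = pd.DataFrame({"x": range(10)})
save_freq = 4

for i in range(0, len(dataset), save_freq):
    chunk_end = min(i + save_freq, len(dataset))
    chunk = dataset.iloc[i:chunk_end]  # rows i .. chunk_end - 1
    print(len(chunk))                  # 4, 4, 2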
@@ -487,7 +533,11 @@ class Flow(BaseModel):
             )
 
             # Combine with previously completed samples if any
-            if
+            if (
+                checkpointer
+                and completed_dataset is not None
+                and not completed_dataset.empty
+            ):
                 final_dataset = safe_concatenate_with_validation(
                     [completed_dataset, final_dataset],
                     "completed checkpoint data with newly processed data",
@@ -505,7 +555,7 @@ class Flow(BaseModel):
             checkpointer.save_final_checkpoint()
 
         # Combine with previously completed samples if any
-        if completed_dataset:
+        if completed_dataset is not None and not completed_dataset.empty:
             final_dataset = safe_concatenate_with_validation(
                 [completed_dataset, final_dataset],
                 "completed checkpoint data with newly processed data",
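Note: the explicit `is not None and not ... .empty` guards are not just style. Evaluating a DataFrame in a boolean context raises, so the old `if completed_dataset:` pattern cannot survive the pandas migration. A short illustration:

import pandas as pd

completed_dataset = pd.DataFrame({"x": [1]})
# bool(completed_dataset)  # raises ValueError: truth value of a DataFrame is ambiguous
if completed_dataset is not None and not completed_dataset.empty:
    print("have checkpointed rows to merge")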
@@ -543,7 +593,7 @@ class Flow(BaseModel):
         flow_logger.info(
             f"Flow '{self.metadata.name}' completed successfully: "
             f"{len(final_dataset)} final samples, "
-            f"{len(final_dataset.
+            f"{len(final_dataset.columns)} final columns"
         )
 
         # Close file handlers if we opened a flow-specific logger
@@ -557,20 +607,20 @@ class Flow(BaseModel):
             finally:
                 flow_logger.removeHandler(h)
 
-        return final_dataset
+        return self._convert_from_dataframe(final_dataset, was_dataset)
 
     def _execute_blocks_on_dataset(
         self,
-        dataset:
+        dataset: pd.DataFrame,
         runtime_params: dict[str, dict[str, Any]],
         flow_logger=None,
         max_concurrency: Optional[int] = None,
-    ) ->
+    ) -> pd.DataFrame:
         """Execute all blocks in sequence on the given dataset.
 
         Parameters
         ----------
-        dataset :
+        dataset : pd.DataFrame
             Dataset to process through all blocks.
         runtime_params : Dict[str, Dict[str, Any]]
             Runtime parameters for block execution.
@@ -581,13 +631,12 @@ class Flow(BaseModel):
 
         Returns
         -------
-
+        pd.DataFrame
             Dataset after processing through all blocks.
         """
         # Use provided logger or fall back to global logger
         exec_logger = flow_logger if flow_logger is not None else logger
         current_dataset = dataset
-        current_dataset_temp_path: Optional[Path] = None
 
         # Execute blocks in sequence
         for i, block in enumerate(self.blocks):
@@ -599,14 +648,6 @@ class Flow(BaseModel):
             # Prepare block execution parameters
             block_kwargs = self._prepare_block_kwargs(block, runtime_params)
 
-            block_temp_jsonl_path: Optional[Path] = None
-            dataset_temp_dir: Optional[Path] = None
-            if getattr(block, "_flow_requires_jsonl_tmp", False):
-                block_temp_jsonl_path = create_temp_file(
-                    prefix=f"{block.block_name}_parser", suffix=".jsonl"
-                )
-                block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
-
             # Add max_concurrency to block kwargs if provided
             if max_concurrency is not None:
                 block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -614,7 +655,7 @@ class Flow(BaseModel):
             # Capture metrics before execution
             start_time = time.perf_counter()
             input_rows = len(current_dataset)
-            input_cols = set(current_dataset.
+            input_cols = set(current_dataset.columns)
 
             try:
                 # Execute block with validation and logging
@@ -626,32 +667,10 @@ class Flow(BaseModel):
                         f"Block '{block.block_name}' produced empty dataset"
                     )
 
-                # Here, we write and reload dataset object from and to disk.
-                # This is done because HF Datasets library creates a ton of intermediate
-                # objects, and holds on to them even after the objects have fulfilled
-                # their purpose. To get flush these objects, HF recommends to implement
-                # this `save_to_disk` and `load_from_disk` hack.
-                # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
-                previous_temp_path = current_dataset_temp_path
-                dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
-                current_dataset.save_to_disk(str(dataset_temp_dir))
-                del current_dataset
-                gc.collect()
-                current_dataset = datasets.load_from_disk(
-                    str(dataset_temp_dir), keep_in_memory=False
-                )
-                finalize(current_dataset, cleanup_path, dataset_temp_dir)
-                current_dataset_temp_path = dataset_temp_dir
-                if previous_temp_path and previous_temp_path != dataset_temp_dir:
-                    cleanup_path(previous_temp_path)
-
-                if block_temp_jsonl_path is not None:
-                    cleanup_path(block_temp_jsonl_path)
-
                 # Capture metrics after successful execution
                 execution_time = time.perf_counter() - start_time
                 output_rows = len(current_dataset)
-                output_cols = set(current_dataset.
+                output_cols = set(current_dataset.columns)
                 added_cols = output_cols - input_cols
                 removed_cols = input_cols - output_cols
 
@@ -672,14 +691,10 @@ class Flow(BaseModel):
                 exec_logger.info(
                     f"Block '{block.block_name}' completed successfully: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.
+                    f"{len(current_dataset.columns)} columns"
                 )
 
             except Exception as exc:
-                if block_temp_jsonl_path is not None:
-                    cleanup_path(block_temp_jsonl_path)
-                if dataset_temp_dir is not None:
-                    cleanup_path(dataset_temp_dir)
                 # Capture metrics for failed execution
                 execution_time = time.perf_counter() - start_time
                 self._block_metrics.append(
@@ -703,13 +718,6 @@ class Flow(BaseModel):
                     f"Block '{block.block_name}' execution failed: {exc}"
                 ) from exc
 
-        if current_dataset_temp_path is not None:
-            final_temp_path = current_dataset_temp_path
-            current_dataset = datasets.load_from_disk(
-                str(final_temp_path), keep_in_memory=True
-            )
-            cleanup_path(final_temp_path)
-
         return current_dataset
 
     def _prepare_block_kwargs(
@@ -981,17 +989,37 @@ class Flow(BaseModel):
             "experimental": self.metadata.recommended_models.experimental,
         }
 
-    def validate_dataset(
-
+    def validate_dataset(
+        self, dataset: Union[pd.DataFrame, datasets.Dataset]
+    ) -> list[str]:
+        """Validate dataset against flow requirements.
+
+        Parameters
+        ----------
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Dataset to validate. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
+
+        Returns
+        -------
+        list[str]
+            List of validation error messages (empty if valid).
+        """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         errors = []
 
         if len(dataset) == 0:
             errors.append("Dataset is empty")
 
         if self.metadata.dataset_requirements:
+            # Get column names
+            columns = dataset.columns.tolist()
+
             errors.extend(
                 self.metadata.dataset_requirements.validate_dataset(
-
+                    columns, len(dataset)
                 )
             )
 
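Note: a hedged usage sketch of the reworked validation entry point, assuming `flow` is a loaded Flow whose dataset requirements expect a `document` column (the column name is illustrative):

import pandas as pd

errors = flow.validate_dataset(pd.DataFrame({"unexpected_column": ["value"]}))
if errors:
    print("\n".join(errors))  # e.g. a missing-required-column message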
@@ -999,7 +1027,7 @@ class Flow(BaseModel):
 
     def dry_run(
         self,
-        dataset: Dataset,
+        dataset: Union[pd.DataFrame, datasets.Dataset],
         sample_size: int = 2,
         runtime_params: Optional[dict[str, dict[str, Any]]] = None,
         max_concurrency: Optional[int] = None,
@@ -1009,8 +1037,9 @@ class Flow(BaseModel):
 
         Parameters
         ----------
-        dataset : Dataset
-            Input dataset to test with.
+        dataset : Union[pd.DataFrame, datasets.Dataset]
+            Input dataset to test with. Can be either pandas DataFrame or HuggingFace Dataset
+            (will be automatically converted to DataFrame for backwards compatibility).
         sample_size : int, default=2
             Number of samples to use for dry run testing.
         runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
@@ -1035,6 +1064,9 @@ class Flow(BaseModel):
         FlowValidationError
             If any block fails during dry run execution.
         """
+        # Convert to DataFrame if needed (backwards compatibility)
+        dataset, _ = self._convert_to_dataframe(dataset)
+
         # Validate preconditions
         if not self.blocks:
             raise FlowValidationError("Cannot dry run empty flow")
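Note: dry_run() follows the same convention; a minimal sketch, again assuming a configured `flow` and an illustrative column name:

import pandas as pd

results = flow.dry_run(pd.DataFrame({"document": ["sample text"]}), sample_size=1)
print(results["sample_size"], results["input_columns"])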
@@ -1066,7 +1098,7 @@ class Flow(BaseModel):
         )
 
         # Create subset dataset
-        sample_dataset = dataset.
+        sample_dataset = dataset.iloc[:actual_sample_size]
 
         # Initialize dry run results
         dry_run_results = {
@@ -1075,7 +1107,7 @@ class Flow(BaseModel):
             "sample_size": actual_sample_size,
             "original_dataset_size": len(dataset),
             "max_concurrency": max_concurrency,
-            "input_columns": dataset.
+            "input_columns": dataset.columns.tolist(),
             "blocks_executed": [],
             "final_dataset": None,
             "execution_successful": True,
@@ -1119,7 +1151,7 @@ class Flow(BaseModel):
                     "execution_time_seconds": block_execution_time,
                     "input_rows": input_rows,
                     "output_rows": len(current_dataset),
-                    "output_columns": current_dataset.
+                    "output_columns": current_dataset.columns.tolist(),
                     "parameters_used": block_kwargs,
                 }
 
@@ -1128,14 +1160,14 @@ class Flow(BaseModel):
                 logger.info(
                     f"Dry run block '{block.block_name}' completed: "
                     f"{len(current_dataset)} samples, "
-                    f"{len(current_dataset.
+                    f"{len(current_dataset.columns)} columns, "
                     f"{block_execution_time:.2f}s"
                 )
 
             # Store final results
             dry_run_results["final_dataset"] = {
                 "rows": len(current_dataset),
-                "columns": current_dataset.
+                "columns": current_dataset.columns.tolist(),
                 "sample_data": current_dataset.to_dict()
                 if len(current_dataset) > 0
                 else {},
@@ -1170,7 +1202,7 @@ class Flow(BaseModel):
     def _estimate_total_time(
         self,
         first_run_results: dict[str, Any],
-        dataset:
+        dataset: pd.DataFrame,
         runtime_params: Optional[dict[str, dict[str, Any]]],
         max_concurrency: Optional[int],
     ) -> dict[str, Any]:
@@ -1183,7 +1215,7 @@ class Flow(BaseModel):
         ----------
         first_run_results : dict
             Results from the first dry run.
-        dataset :
+        dataset : pd.DataFrame
             Full dataset for estimation.
         runtime_params : Optional[dict]
             Runtime parameters.
@@ -1332,13 +1364,13 @@ class Flow(BaseModel):
         """
         return self.metadata.dataset_requirements
 
-    def get_dataset_schema(self) ->
+    def get_dataset_schema(self) -> pd.DataFrame:
         """Get an empty dataset with the correct schema for this flow.
 
         Returns
         -------
-
-            Empty
+        pd.DataFrame
+            Empty DataFrame with the correct schema/features for this flow.
             Users can add data to this dataset or use it to validate their own dataset schema.
 
         Examples
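Note: the final hunk below swaps the removed schema construction for plain pandas dtypes. A standalone sketch of what the resulting empty schema frame looks like (column names and types are assumptions for illustration only):

import pandas as pd

schema = {"document": "object", "num_samples": "Int64", "score": "float64", "active": "boolean"}
schema_dataset = pd.DataFrame(
    {col: pd.Series([], dtype=dtype) for col, dtype in schema.items()}
)
print(schema_dataset.dtypes)  # object, Int64, float64, boolean -- zero rows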
@@ -1354,50 +1386,51 @@ class Flow(BaseModel):
         ... })
         >>>
         >>> # Or validate your existing dataset schema
-        >>> my_dataset =
-        >>> if my_dataset.
+        >>> my_dataset = pd.DataFrame(my_data)
+        >>> if my_dataset.dtypes.equals(schema_dataset.dtypes):
         ...     print("Schema matches!")
         """
 
         requirements = self.get_dataset_requirements()
 
         if requirements is None:
-            # Return empty
-            return
+            # Return empty dataframe with no schema requirements
+            return pd.DataFrame({})
 
-        # Build schema
-
+        # Build schema with column names and dtypes
+        schema = {}
 
         # Process required columns
         for col_name in requirements.required_columns:
             col_type = requirements.column_types.get(col_name, "string")
-
+            schema[col_name] = self._map_column_type_to_dtype(col_type)
 
         # Process optional columns
         for col_name in requirements.optional_columns:
             col_type = requirements.column_types.get(col_name, "string")
-
+            schema[col_name] = self._map_column_type_to_dtype(col_type)
 
-        # Create empty
-
-
+        # Create empty dataframe with the correct dtypes
+        empty_data = {
+            col_name: pd.Series([], dtype=dtype) for col_name, dtype in schema.items()
+        }
 
-        return
+        return pd.DataFrame(empty_data)
 
-    def
-        """Map column type string to
-        # Map common type names to
+    def _map_column_type_to_dtype(self, col_type: str):
+        """Map column type string to pandas dtype."""
+        # Map common type names to pandas dtypes
         if col_type in ["str", "string", "text"]:
-            return
+            return "object"  # pandas uses 'object' for strings
         elif col_type in ["int", "integer"]:
-            return
+            return "Int64"  # nullable integer
         elif col_type in ["float", "number"]:
-            return
+            return "float64"
         elif col_type in ["bool", "boolean"]:
-            return
+            return "boolean"  # nullable boolean
         else:
-            # Default to string for unknown types
-            return
+            # Default to object (string) for unknown types
+            return "object"
 
     def print_info(self) -> None:
         """