sdg-hub 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Files changed (28)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/base.py +60 -58
  3. sdg_hub/core/blocks/filtering/column_value_filter.py +29 -16
  4. sdg_hub/core/blocks/llm/__init__.py +0 -2
  5. sdg_hub/core/blocks/llm/llm_chat_block.py +42 -36
  6. sdg_hub/core/blocks/llm/llm_parser_block.py +13 -59
  7. sdg_hub/core/blocks/llm/prompt_builder_block.py +15 -10
  8. sdg_hub/core/blocks/llm/text_parser_block.py +14 -61
  9. sdg_hub/core/blocks/transform/duplicate_columns.py +9 -8
  10. sdg_hub/core/blocks/transform/index_based_mapper.py +29 -15
  11. sdg_hub/core/blocks/transform/json_structure_block.py +16 -13
  12. sdg_hub/core/blocks/transform/melt_columns.py +13 -12
  13. sdg_hub/core/blocks/transform/rename_columns.py +20 -9
  14. sdg_hub/core/blocks/transform/text_concat.py +20 -21
  15. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +6 -5
  16. sdg_hub/core/flow/base.py +139 -106
  17. sdg_hub/core/flow/checkpointer.py +34 -36
  18. sdg_hub/core/flow/validation.py +4 -4
  19. sdg_hub/core/utils/datautils.py +52 -54
  20. sdg_hub/core/utils/flow_metrics.py +9 -6
  21. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +1 -0
  22. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/METADATA +5 -9
  23. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/RECORD +26 -28
  24. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +0 -771
  25. sdg_hub/core/utils/temp_manager.py +0 -57
  26. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/WHEEL +0 -0
  27. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/licenses/LICENSE +0 -0
  28. {sdg_hub-0.5.1.dist-info → sdg_hub-0.6.1.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/base.py CHANGED
@@ -5,13 +5,9 @@
  from datetime import datetime
  from pathlib import Path
  from typing import Any, Optional, Union
- from weakref import finalize
- import gc
  import time
  import uuid

- # Third Party
- from datasets import Dataset
  from pydantic import (
  BaseModel,
  ConfigDict,
@@ -25,6 +21,9 @@ from rich.panel import Panel
  from rich.table import Table
  from rich.tree import Tree
  import datasets
+
+ # Third Party
+ import pandas as pd
  import yaml

  # Local
@@ -39,11 +38,6 @@ from ..utils.flow_metrics import (
  )
  from ..utils.logger_config import setup_logger
  from ..utils.path_resolution import resolve_path
- from ..utils.temp_manager import (
- cleanup_path,
- create_temp_dir,
- create_temp_file,
- )
  from ..utils.time_estimator import estimate_execution_time
  from ..utils.yaml_utils import save_flow_yaml
  from .checkpointer import FlowCheckpointer
@@ -292,15 +286,62 @@ class Flow(BaseModel):
  return {key: str(yaml_dir / path) for key, path in paths.items()}
  return paths

+ @staticmethod
+ def _convert_to_dataframe(
+ dataset: Union[pd.DataFrame, datasets.Dataset],
+ ) -> tuple[pd.DataFrame, bool]:
+ """Convert datasets.Dataset to pd.DataFrame if needed (backwards compatibility).
+
+ Parameters
+ ----------
+ dataset : Union[pd.DataFrame, datasets.Dataset]
+ Input dataset in either format.
+
+ Returns
+ -------
+ tuple[pd.DataFrame, bool]
+ Tuple of (converted DataFrame, was_dataset flag).
+ was_dataset is True if input was a datasets.Dataset, False if it was already a DataFrame.
+ """
+ if isinstance(dataset, datasets.Dataset):
+ logger.info("Converting datasets.Dataset to pd.DataFrame for processing")
+ return dataset.to_pandas(), True
+ return dataset, False
+
+ @staticmethod
+ def _convert_from_dataframe(
+ df: pd.DataFrame, should_convert: bool
+ ) -> Union[pd.DataFrame, datasets.Dataset]:
+ """Convert pd.DataFrame back to datasets.Dataset if needed (backwards compatibility).
+
+ Parameters
+ ----------
+ df : pd.DataFrame
+ DataFrame to potentially convert.
+ should_convert : bool
+ If True, convert to datasets.Dataset. If False, return as-is.
+
+ Returns
+ -------
+ Union[pd.DataFrame, datasets.Dataset]
+ Original DataFrame or converted Dataset, matching the input type.
+ """
+ if should_convert:
+ logger.info(
+ "Converting pd.DataFrame back to datasets.Dataset to match input type"
+ )
+ return datasets.Dataset.from_pandas(df)
+ return df
+
  def generate(
  self,
- dataset: Dataset,
+ dataset: Union[pd.DataFrame, datasets.Dataset],
  runtime_params: Optional[dict[str, dict[str, Any]]] = None,
  checkpoint_dir: Optional[str] = None,
  save_freq: Optional[int] = None,
  log_dir: Optional[str] = None,
  max_concurrency: Optional[int] = None,
- ) -> Dataset:
+ ) -> Union[pd.DataFrame, datasets.Dataset]:
  """Execute the flow blocks in sequence to generate data.

  Note: For flows with LLM blocks, set_model_config() must be called first
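
The new _convert_to_dataframe / _convert_from_dataframe helpers above round-trip between pandas and Hugging Face datasets so callers get back the same type they passed in. A minimal standalone sketch of the same pattern, using only public pandas and datasets APIs (the helper names here are illustrative, not part of sdg_hub):

    import pandas as pd
    from datasets import Dataset

    def to_dataframe(data):
        # Return (DataFrame, was_dataset) so the caller can restore the input type later.
        if isinstance(data, Dataset):
            return data.to_pandas(), True
        return data, False

    def from_dataframe(df, was_dataset):
        # Convert back only when the caller originally passed a datasets.Dataset.
        return Dataset.from_pandas(df) if was_dataset else df

    # Round trip: a Dataset goes in, a Dataset comes back out.
    ds = Dataset.from_dict({"document": ["a", "b"]})
    df, was_dataset = to_dataframe(ds)
    assert isinstance(from_dataframe(df, was_dataset), Dataset)
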
@@ -308,8 +349,9 @@ class Flow(BaseModel):

  Parameters
  ----------
- dataset : Dataset
- Input dataset to process.
+ dataset : Union[pd.DataFrame, datasets.Dataset]
+ Input dataset to process. Can be either pandas DataFrame or HuggingFace Dataset
+ (will be automatically converted to DataFrame for backwards compatibility).
  runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
  Runtime parameters organized by block name. Format:
  {
@@ -331,8 +373,9 @@ class Flow(BaseModel):

  Returns
  -------
- Dataset
+ Union[pd.DataFrame, datasets.Dataset]
  Processed dataset after all blocks have been executed.
+ Return type matches the input type (DataFrame in -> DataFrame out, Dataset in -> Dataset out).

  Raises
  ------
@@ -341,6 +384,9 @@ class Flow(BaseModel):
  FlowValidationError
  If flow validation fails or if model configuration is required but not set.
  """
+ # Convert to DataFrame if needed (backwards compatibility)
+ dataset, was_dataset = self._convert_to_dataframe(dataset)
+
  # Validate save_freq parameter early to prevent range() errors
  if save_freq is not None and save_freq <= 0:
  raise FlowValidationError(
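
As the updated signature and docstring show, generate() now accepts either type and mirrors it on return. A hedged usage sketch, assuming flow is an already-constructed and configured Flow instance and using placeholder column names:

    import pandas as pd
    from datasets import Dataset

    # Passing a DataFrame returns a DataFrame.
    df_in = pd.DataFrame({"document": ["some text"], "domain": ["general"]})
    df_out = flow.generate(df_in)

    # Passing a datasets.Dataset returns a datasets.Dataset.
    ds_in = Dataset.from_pandas(df_in)
    ds_out = flow.generate(ds_in)
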
@@ -436,7 +482,7 @@ class Flow(BaseModel):
  finally:
  flow_logger.removeHandler(h)

- return completed_dataset
+ return self._convert_from_dataframe(completed_dataset, was_dataset)

  dataset = remaining_dataset
  flow_logger.info(f"Resuming with {len(dataset)} remaining samples")
@@ -463,7 +509,7 @@ class Flow(BaseModel):
  # Process in chunks of save_freq
  for i in range(0, len(dataset), save_freq):
  chunk_end = min(i + save_freq, len(dataset))
- chunk_dataset = dataset.select(range(i, chunk_end))
+ chunk_dataset = dataset.iloc[i:chunk_end]

  flow_logger.info(
  f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
@@ -487,7 +533,11 @@ class Flow(BaseModel):
  )

  # Combine with previously completed samples if any
- if checkpointer and completed_dataset:
+ if (
+ checkpointer
+ and completed_dataset is not None
+ and not completed_dataset.empty
+ ):
  final_dataset = safe_concatenate_with_validation(
  [completed_dataset, final_dataset],
  "completed checkpoint data with newly processed data",
@@ -505,7 +555,7 @@ class Flow(BaseModel):
  checkpointer.save_final_checkpoint()

  # Combine with previously completed samples if any
- if completed_dataset:
+ if completed_dataset is not None and not completed_dataset.empty:
  final_dataset = safe_concatenate_with_validation(
  [completed_dataset, final_dataset],
  "completed checkpoint data with newly processed data",
@@ -543,7 +593,7 @@ class Flow(BaseModel):
  flow_logger.info(
  f"Flow '{self.metadata.name}' completed successfully: "
  f"{len(final_dataset)} final samples, "
- f"{len(final_dataset.column_names)} final columns"
+ f"{len(final_dataset.columns)} final columns"
  )

  # Close file handlers if we opened a flow-specific logger
@@ -557,20 +607,20 @@ class Flow(BaseModel):
  finally:
  flow_logger.removeHandler(h)

- return final_dataset
+ return self._convert_from_dataframe(final_dataset, was_dataset)

  def _execute_blocks_on_dataset(
  self,
- dataset: Dataset,
+ dataset: pd.DataFrame,
  runtime_params: dict[str, dict[str, Any]],
  flow_logger=None,
  max_concurrency: Optional[int] = None,
- ) -> Dataset:
+ ) -> pd.DataFrame:
  """Execute all blocks in sequence on the given dataset.

  Parameters
  ----------
- dataset : Dataset
+ dataset : pd.DataFrame
  Dataset to process through all blocks.
  runtime_params : Dict[str, Dict[str, Any]]
  Runtime parameters for block execution.
@@ -581,13 +631,12 @@ class Flow(BaseModel):

  Returns
  -------
- Dataset
+ pd.DataFrame
  Dataset after processing through all blocks.
  """
  # Use provided logger or fall back to global logger
  exec_logger = flow_logger if flow_logger is not None else logger
  current_dataset = dataset
- current_dataset_temp_path: Optional[Path] = None

  # Execute blocks in sequence
  for i, block in enumerate(self.blocks):
@@ -599,14 +648,6 @@ class Flow(BaseModel):
  # Prepare block execution parameters
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)

- block_temp_jsonl_path: Optional[Path] = None
- dataset_temp_dir: Optional[Path] = None
- if getattr(block, "_flow_requires_jsonl_tmp", False):
- block_temp_jsonl_path = create_temp_file(
- prefix=f"{block.block_name}_parser", suffix=".jsonl"
- )
- block_kwargs["_flow_tmp_jsonl_path"] = str(block_temp_jsonl_path)
-
  # Add max_concurrency to block kwargs if provided
  if max_concurrency is not None:
  block_kwargs["_flow_max_concurrency"] = max_concurrency
@@ -614,7 +655,7 @@ class Flow(BaseModel):
  # Capture metrics before execution
  start_time = time.perf_counter()
  input_rows = len(current_dataset)
- input_cols = set(current_dataset.column_names)
+ input_cols = set(current_dataset.columns)

  try:
  # Execute block with validation and logging
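
The recurring column_names → columns substitutions throughout this file reflect the different column accessors of the two libraries; a brief comparison:

    import pandas as pd
    from datasets import Dataset

    df = pd.DataFrame({"document": ["a"], "domain": ["b"]})
    ds = Dataset.from_pandas(df)

    print(ds.column_names)      # plain list of str: ['document', 'domain']
    print(df.columns.tolist())  # pandas Index converted to the same list of str
    print(set(df.columns))      # set of names, as used for the added/removed column metrics
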
@@ -626,32 +667,10 @@ class Flow(BaseModel):
  f"Block '{block.block_name}' produced empty dataset"
  )

- # Here, we write and reload dataset object from and to disk.
- # This is done because HF Datasets library creates a ton of intermediate
- # objects, and holds on to them even after the objects have fulfilled
- # their purpose. To get flush these objects, HF recommends to implement
- # this `save_to_disk` and `load_from_disk` hack.
- # https://github.com/huggingface/datasets/blob/main/src/datasets/arrow_dataset.py#L1029
- previous_temp_path = current_dataset_temp_path
- dataset_temp_dir = create_temp_dir(prefix=f"flow_{block.block_name}")
- current_dataset.save_to_disk(str(dataset_temp_dir))
- del current_dataset
- gc.collect()
- current_dataset = datasets.load_from_disk(
- str(dataset_temp_dir), keep_in_memory=False
- )
- finalize(current_dataset, cleanup_path, dataset_temp_dir)
- current_dataset_temp_path = dataset_temp_dir
- if previous_temp_path and previous_temp_path != dataset_temp_dir:
- cleanup_path(previous_temp_path)
-
- if block_temp_jsonl_path is not None:
- cleanup_path(block_temp_jsonl_path)
-
  # Capture metrics after successful execution
  execution_time = time.perf_counter() - start_time
  output_rows = len(current_dataset)
- output_cols = set(current_dataset.column_names)
+ output_cols = set(current_dataset.columns)
  added_cols = output_cols - input_cols
  removed_cols = input_cols - output_cols

@@ -672,14 +691,10 @@ class Flow(BaseModel):
  exec_logger.info(
  f"Block '{block.block_name}' completed successfully: "
  f"{len(current_dataset)} samples, "
- f"{len(current_dataset.column_names)} columns"
+ f"{len(current_dataset.columns)} columns"
  )

  except Exception as exc:
- if block_temp_jsonl_path is not None:
- cleanup_path(block_temp_jsonl_path)
- if dataset_temp_dir is not None:
- cleanup_path(dataset_temp_dir)
  # Capture metrics for failed execution
  execution_time = time.perf_counter() - start_time
  self._block_metrics.append(
@@ -703,13 +718,6 @@ class Flow(BaseModel):
  f"Block '{block.block_name}' execution failed: {exc}"
  ) from exc

- if current_dataset_temp_path is not None:
- final_temp_path = current_dataset_temp_path
- current_dataset = datasets.load_from_disk(
- str(final_temp_path), keep_in_memory=True
- )
- cleanup_path(final_temp_path)
-
  return current_dataset

  def _prepare_block_kwargs(
@@ -981,17 +989,37 @@ class Flow(BaseModel):
  "experimental": self.metadata.recommended_models.experimental,
  }

- def validate_dataset(self, dataset: Dataset) -> list[str]:
- """Validate dataset against flow requirements."""
+ def validate_dataset(
+ self, dataset: Union[pd.DataFrame, datasets.Dataset]
+ ) -> list[str]:
+ """Validate dataset against flow requirements.
+
+ Parameters
+ ----------
+ dataset : Union[pd.DataFrame, datasets.Dataset]
+ Dataset to validate. Can be either pandas DataFrame or HuggingFace Dataset
+ (will be automatically converted to DataFrame for backwards compatibility).
+
+ Returns
+ -------
+ list[str]
+ List of validation error messages (empty if valid).
+ """
+ # Convert to DataFrame if needed (backwards compatibility)
+ dataset, _ = self._convert_to_dataframe(dataset)
+
  errors = []

  if len(dataset) == 0:
  errors.append("Dataset is empty")

  if self.metadata.dataset_requirements:
+ # Get column names
+ columns = dataset.columns.tolist()
+
  errors.extend(
  self.metadata.dataset_requirements.validate_dataset(
- dataset.column_names, len(dataset)
+ columns, len(dataset)
  )
  )

@@ -999,7 +1027,7 @@ class Flow(BaseModel):

  def dry_run(
  self,
- dataset: Dataset,
+ dataset: Union[pd.DataFrame, datasets.Dataset],
  sample_size: int = 2,
  runtime_params: Optional[dict[str, dict[str, Any]]] = None,
  max_concurrency: Optional[int] = None,
@@ -1009,8 +1037,9 @@ class Flow(BaseModel):

  Parameters
  ----------
- dataset : Dataset
- Input dataset to test with.
+ dataset : Union[pd.DataFrame, datasets.Dataset]
+ Input dataset to test with. Can be either pandas DataFrame or HuggingFace Dataset
+ (will be automatically converted to DataFrame for backwards compatibility).
  sample_size : int, default=2
  Number of samples to use for dry run testing.
  runtime_params : Optional[Dict[str, Dict[str, Any]]], optional
@@ -1035,6 +1064,9 @@ class Flow(BaseModel):
  FlowValidationError
  If any block fails during dry run execution.
  """
+ # Convert to DataFrame if needed (backwards compatibility)
+ dataset, _ = self._convert_to_dataframe(dataset)
+
  # Validate preconditions
  if not self.blocks:
  raise FlowValidationError("Cannot dry run empty flow")
@@ -1066,7 +1098,7 @@ class Flow(BaseModel):
  )

  # Create subset dataset
- sample_dataset = dataset.select(range(actual_sample_size))
+ sample_dataset = dataset.iloc[:actual_sample_size]

  # Initialize dry run results
  dry_run_results = {
@@ -1075,7 +1107,7 @@ class Flow(BaseModel):
  "sample_size": actual_sample_size,
  "original_dataset_size": len(dataset),
  "max_concurrency": max_concurrency,
- "input_columns": dataset.column_names,
+ "input_columns": dataset.columns.tolist(),
  "blocks_executed": [],
  "final_dataset": None,
  "execution_successful": True,
@@ -1119,7 +1151,7 @@ class Flow(BaseModel):
  "execution_time_seconds": block_execution_time,
  "input_rows": input_rows,
  "output_rows": len(current_dataset),
- "output_columns": current_dataset.column_names,
+ "output_columns": current_dataset.columns.tolist(),
  "parameters_used": block_kwargs,
  }

@@ -1128,14 +1160,14 @@ class Flow(BaseModel):
  logger.info(
  f"Dry run block '{block.block_name}' completed: "
  f"{len(current_dataset)} samples, "
- f"{len(current_dataset.column_names)} columns, "
+ f"{len(current_dataset.columns)} columns, "
  f"{block_execution_time:.2f}s"
  )

  # Store final results
  dry_run_results["final_dataset"] = {
  "rows": len(current_dataset),
- "columns": current_dataset.column_names,
+ "columns": current_dataset.columns.tolist(),
  "sample_data": current_dataset.to_dict()
  if len(current_dataset) > 0
  else {},
@@ -1170,7 +1202,7 @@ class Flow(BaseModel):
  def _estimate_total_time(
  self,
  first_run_results: dict[str, Any],
- dataset: Dataset,
+ dataset: pd.DataFrame,
  runtime_params: Optional[dict[str, dict[str, Any]]],
  max_concurrency: Optional[int],
  ) -> dict[str, Any]:
@@ -1183,7 +1215,7 @@ class Flow(BaseModel):
  ----------
  first_run_results : dict
  Results from the first dry run.
- dataset : Dataset
+ dataset : pd.DataFrame
  Full dataset for estimation.
  runtime_params : Optional[dict]
  Runtime parameters.
@@ -1332,13 +1364,13 @@ class Flow(BaseModel):
  """
  return self.metadata.dataset_requirements

- def get_dataset_schema(self) -> Dataset:
+ def get_dataset_schema(self) -> pd.DataFrame:
  """Get an empty dataset with the correct schema for this flow.

  Returns
  -------
- Dataset
- Empty HuggingFace Dataset with the correct schema/features for this flow.
+ pd.DataFrame
+ Empty DataFrame with the correct schema/features for this flow.
  Users can add data to this dataset or use it to validate their own dataset schema.

  Examples
@@ -1354,50 +1386,51 @@ class Flow(BaseModel):
  ... })
  >>>
  >>> # Or validate your existing dataset schema
- >>> my_dataset = Dataset.from_dict(my_data)
- >>> if my_dataset.features == schema_dataset.features:
+ >>> my_dataset = pd.DataFrame(my_data)
+ >>> if my_dataset.dtypes.equals(schema_dataset.dtypes):
  ... print("Schema matches!")
  """

  requirements = self.get_dataset_requirements()

  if requirements is None:
- # Return empty dataset with no schema requirements
- return Dataset.from_dict({})
+ # Return empty dataframe with no schema requirements
+ return pd.DataFrame({})

- # Build schema features
- schema_features = {}
+ # Build schema with column names and dtypes
+ schema = {}

  # Process required columns
  for col_name in requirements.required_columns:
  col_type = requirements.column_types.get(col_name, "string")
- schema_features[col_name] = self._map_column_type_to_feature(col_type)
+ schema[col_name] = self._map_column_type_to_dtype(col_type)

  # Process optional columns
  for col_name in requirements.optional_columns:
  col_type = requirements.column_types.get(col_name, "string")
- schema_features[col_name] = self._map_column_type_to_feature(col_type)
+ schema[col_name] = self._map_column_type_to_dtype(col_type)

- # Create empty dataset with the correct features
- features = datasets.Features(schema_features)
- empty_data = {col_name: [] for col_name in schema_features.keys()}
+ # Create empty dataframe with the correct dtypes
+ empty_data = {
+ col_name: pd.Series([], dtype=dtype) for col_name, dtype in schema.items()
+ }

- return Dataset.from_dict(empty_data, features=features)
+ return pd.DataFrame(empty_data)

- def _map_column_type_to_feature(self, col_type: str):
- """Map column type string to HuggingFace feature type."""
- # Map common type names to HuggingFace types
+ def _map_column_type_to_dtype(self, col_type: str):
+ """Map column type string to pandas dtype."""
+ # Map common type names to pandas dtypes
  if col_type in ["str", "string", "text"]:
- return datasets.Value("string")
+ return "object" # pandas uses 'object' for strings
  elif col_type in ["int", "integer"]:
- return datasets.Value("int64")
+ return "Int64" # nullable integer
  elif col_type in ["float", "number"]:
- return datasets.Value("float64")
+ return "float64"
  elif col_type in ["bool", "boolean"]:
- return datasets.Value("bool")
+ return "boolean" # nullable boolean
  else:
- # Default to string for unknown types
- return datasets.Value("string")
+ # Default to object (string) for unknown types
+ return "object"

  def print_info(self) -> None:
  """