dragon-ml-toolbox 3.2.1__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (30)
  1. {dragon_ml_toolbox-3.2.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.4.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ETL_engineering.py +64 -15
  4. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/VIF_factor.py +2 -2
  5. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/data_exploration.py +42 -0
  6. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/utilities.py +52 -6
  7. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/pyproject.toml +1 -1
  8. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/LICENSE +0 -0
  9. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/LICENSE-THIRD-PARTY.md +0 -0
  10. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/README.md +0 -0
  11. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  12. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  13. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  14. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/GUI_tools.py +0 -0
  16. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/MICE_imputation.py +0 -0
  17. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_callbacks.py +0 -0
  18. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_trainer.py +0 -0
  20. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ML_tutorial.py +0 -0
  21. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/PSO_optimization.py +0 -0
  22. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/RNN_forecast.py +0 -0
  23. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/__init__.py +0 -0
  24. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  25. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/_pytorch_models.py +0 -0
  26. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/datasetmaster.py +0 -0
  27. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/ensemble_learning.py +0 -0
  28. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/handle_excel.py +0 -0
  29. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/ml_tools/logger.py +0 -0
  30. {dragon_ml_toolbox-3.2.1 → dragon_ml_toolbox-3.4.0}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.2.1
+Version: 3.4.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 3.2.1
+Version: 3.4.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ETL_engineering.py

@@ -12,6 +12,7 @@ __all__ = [
     "TransformationRecipe",
     "DataProcessor",
     "BinaryTransformer",
+    "MultiBinaryDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -400,12 +401,72 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)
 
 
+class MultiBinaryDummifier:
+    """
+    A one-to-many transformer that creates multiple binary columns from a single
+    text column based on a list of keywords.
+
+    For each keyword provided, this transformer generates a corresponding column
+    with a value of 1 if the keyword is present in the input string, and 0 otherwise.
+    It is designed to be used within the DataProcessor pipeline.
+
+    Args:
+        keywords (List[str]):
+            A list of strings, where each string is a keyword to search for. A separate
+            binary column will be created for each keyword.
+        case_insensitive (bool):
+            If True, keyword matching ignores case. Defaults to True.
+    """
+    def __init__(self, keywords: List[str], case_insensitive: bool = True):
+        if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
+            raise TypeError("The 'keywords' argument must be a list of strings.")
+        if not keywords:
+            raise ValueError("The 'keywords' list cannot be empty.")
+
+        self.keywords = keywords
+        self.case_insensitive = case_insensitive
+
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the dummification logic.
+
+        Args:
+            column (pl.Series): The input Polars Series to transform.
+
+        Returns:
+            pl.DataFrame: A DataFrame where each column corresponds to a keyword.
+        """
+        # Ensure the input is treated as a string, preserving nulls
+        str_column = column.cast(pl.Utf8)
+
+        output_expressions = []
+        for i, keyword in enumerate(self.keywords):
+            # Escape keyword to treat it as a literal, not a regex pattern
+            base_pattern = re.escape(keyword)
+
+            # Add case-insensitivity flag `(?i)` if needed
+            pattern = f"(?i){base_pattern}" if self.case_insensitive else base_pattern
+
+            # Create the binary expression
+            expr = (
+                pl.when(str_column.is_null())
+                .then(None)  # Propagate nulls from original column
+                .when(str_column.str.contains(pattern))
+                .then(pl.lit(1, dtype=pl.UInt8))
+                .otherwise(pl.lit(0, dtype=pl.UInt8))
+                .alias(f"col_{i}")  # Generic name for DataProcessor
+            )
+            output_expressions.append(expr)
+
+        return pl.select(output_expressions)
+
+
 class KeywordDummifier:
     """
     A configurable transformer that creates one-hot encoded columns based on
     keyword matching in a Polars Series.
 
-    Instantiate this class with keyword configurations. The instance can be used as a 'transform' callable compatible with the `TransformationRecipe`.
+    Operates on a "first match wins" principle.
 
     Args:
         group_names (List[str]):
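
A minimal usage sketch of the new MultiBinaryDummifier, assuming it is imported from ml_tools.ETL_engineering and called directly on a Polars Series (input values and keywords are illustrative; inside a DataProcessor pipeline the generic col_0/col_1 names are presumably renamed downstream by the recipe):

import polars as pl
from ml_tools.ETL_engineering import MultiBinaryDummifier

colors = pl.Series("colors", ["red and blue", "green", None, "BLUE"])
dummifier = MultiBinaryDummifier(keywords=["red", "blue"], case_insensitive=True)

# One UInt8 column per keyword: 1 if the keyword is present, 0 otherwise,
# with nulls propagated from the input column.
dummies = dummifier(colors)
print(dummies)  # columns "col_0" (red) and "col_1" (blue)
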
@@ -417,17 +478,14 @@ class KeywordDummifier:
             `group_name` at the same index and contains the keywords to search for.
         case_insensitive (bool):
             If True, keyword matching ignores case.
-        drop_empty (bool):
-            If True, columns that contain no positive matches (all zeros) will be dropped from the final output.
     """
-    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True, drop_empty: bool = True):
+    def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
         if len(group_names) != len(group_keywords):
             raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
 
         self.group_names = group_names
         self.group_keywords = group_keywords
         self.case_insensitive = case_insensitive
-        self.drop_empty = drop_empty
 
     def __call__(self, column: pl.Series) -> pl.DataFrame:
         """
@@ -474,16 +532,7 @@ class KeywordDummifier:
             # If a group had no matches, create a column of zeros
             final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
 
-        # First, create a full DataFrame with all potential columns
-        result_df = pl.DataFrame(final_columns)
-
-        # If drop_empty is True, filter out all-zero columns
-        if self.drop_empty:
-            # A column is kept if its sum is greater than 0
-            cols_to_keep = [col for col in result_df if col.sum() > 0]
-            return result_df.select(cols_to_keep)
-
-        return result_df
+        return pl.DataFrame(final_columns)
 
 
 class NumberExtractor:
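
Note that KeywordDummifier no longer drops all-zero groups itself; per the docstring of the new drop_zero_only_columns (in the data_exploration.py hunks further down), empty dummy columns can be cleaned up afterwards. A hedged sketch, assuming the Polars output is converted to pandas first (to_pandas() requires pyarrow; group names and keywords are illustrative):

import polars as pl
from ml_tools.ETL_engineering import KeywordDummifier
from ml_tools.data_exploration import drop_zero_only_columns

dummifier = KeywordDummifier(
    group_names=["is_metal", "is_polymer"],
    group_keywords=[["steel", "iron"], ["nylon"]],
)
one_hot = dummifier(pl.Series("material", ["steel rod", "cast iron", "rubber"]))

# Groups with no positive matches now survive as all-zero columns;
# drop them on the pandas side after conversion.
cleaned = drop_zero_only_columns(one_hot.to_pandas())
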
ml_tools/VIF_factor.py

@@ -35,7 +35,7 @@ def compute_vif(
     Args:
         df (pd.DataFrame): The input DataFrame.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
@@ -194,7 +194,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
         output_plot_directory (str | Path): Save plots to this directory.
         output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
-        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
         fontsize (int): Base fontsize to scale title and labels on hte plot.
 
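
The corrected docstrings make the precedence explicit: ignore_columns is skipped whenever use_columns is provided. A rough call sketch, assuming the remaining compute_vif parameters keep their defaults (the DataFrame and column names are illustrative):

import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                   "x2": [2.0, 4.1, 5.9, 8.2],
                   "sample_id": [1, 2, 3, 4]})

# Whitelist the columns to test; ignore_columns would be skipped here.
compute_vif(df, use_columns=["x1", "x2"])

# Or exclude a few columns and test every other numeric column.
compute_vif(df, ignore_columns=["sample_id"])
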
ml_tools/data_exploration.py

@@ -15,6 +15,7 @@ import re
 # Keep track of all available tools, show using `info()`
 __all__ = [
     "summarize_dataframe",
+    "drop_zero_only_columns",
     "drop_rows_with_missing_data",
     "split_features_targets",
     "show_null_columns",
@@ -61,6 +62,47 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
 
 
+def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
+    """
+    Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.
+
+    This utility is useful for cleaning data after dummification steps that may result in empty columns.
+
+    Args:
+        df (pd.DataFrame):
+            The pandas DataFrame to clean.
+
+    Returns:
+        pd.DataFrame:
+            A new DataFrame with the empty columns removed.
+    """
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("Input must be a pandas DataFrame.")
+
+    original_columns = set(df.columns)
+
+    cols_to_keep = []
+    for col_name in df.columns:
+        column = df[col_name]
+
+        # Keep any column that is not numeric by default
+        if not is_numeric_dtype(column):
+            cols_to_keep.append(col_name)
+            continue
+
+        # For numeric columns, check if there's at least one non-zero value.
+        if (column != 0).any():
+            cols_to_keep.append(col_name)
+
+    dropped_columns = original_columns - set(cols_to_keep)
+    if dropped_columns and verbose:
+        print(f"Dropped {len(dropped_columns)} columns:")
+        for dropped_column in dropped_columns:
+            print(f"  {dropped_column}")
+
+    return df[cols_to_keep]
+
+
 def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
     Drops rows from the DataFrame using a two-stage strategy:
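
A small sketch of the new helper on a toy frame (column names are illustrative): numeric columns with no non-zero value are dropped, while non-numeric columns are always kept.

import pandas as pd
from ml_tools.data_exploration import drop_zero_only_columns

df = pd.DataFrame({
    "dose": [0.5, 1.0, 0.0],    # kept: contains non-zero values
    "empty_dummy": [0, 0, 0],   # dropped: zeros only
    "label": ["a", "b", "c"],   # kept: non-numeric
})

cleaned = drop_zero_only_columns(df)  # prints the dropped names when verbose=True
print(list(cleaned.columns))          # expected: ['dose', 'label']
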
ml_tools/utilities.py

@@ -24,7 +24,8 @@ __all__ = [
     "threshold_binary_values_batch",
     "serialize_object",
     "deserialize_object",
-    "distribute_datasets_by_target"
+    "distribute_datasets_by_target",
+    "train_dataset_orchestrator"
 ]
 
 
@@ -497,7 +498,7 @@ def threshold_binary_values_batch(
     return np.hstack([cont_part, bin_part])
 
 
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
@@ -505,9 +506,6 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
         obj (Any) : The Python object to serialize.
         save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
-
-    Returns:
-        (Path | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
         save_path = make_fullpath(save_dir, make=True)
@@ -526,7 +524,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
     else:
         if verbose:
             print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
-        return full_path
+        return None
 
 
 def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
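
Since serialize_object no longer returns the saved Path, callers that kept the old return value should now build the path from save_dir and filename themselves. A brief sketch of the 3.4.0 behavior (the object and directory name are illustrative):

from ml_tools.utilities import serialize_object

stats = {"mean": 0.0, "std": 1.0}
result = serialize_object(stats, save_dir="artifacts", filename="scaler_stats")
assert result is None  # the full file path is no longer returned
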
@@ -597,6 +595,54 @@ def distribute_datasets_by_target(
         yield target, subset
 
 
+def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
+                               target_columns: list[str],
+                               save_dir: Union[str,Path],
+                               safe_mode: bool=False):
+    """
+    Orchestrates the creation of single-target datasets from multiple directories each with a variable number of CSV datasets.
+
+    This function iterates through a list of directories, finds all CSV files,
+    and splits each dataframe based on the provided target columns. Each resulting
+    single-target dataframe is then saved to a specified directory.
+
+    Parameters
+    ----------
+    list_of_dirs : list[str | Path]
+        A list of directory paths where the source CSV files are located.
+    target_columns : list[str]
+        A list of column names to be used as targets for splitting the datasets.
+    save_dir : str | Path
+        The directory where the newly created single-target datasets will be saved.
+    safe_mode : bool
+        If True, prefixes the saved filename with the source directory name to prevent overwriting files with the same name from different sources.
+    """
+    all_dir_paths: list[Path] = list()
+    for dir in list_of_dirs:
+        dir_path = make_fullpath(dir)
+        if not dir_path.is_dir():
+            raise IOError(f"'{dir}' is not a directory.")
+        all_dir_paths.append(dir_path)
+
+    # main loop
+    total_saved = 0
+    for df_dir in all_dir_paths:
+        for df_name, df_path in list_csv_paths(df_dir).items():
+            try:
+                for target_name, df in distribute_datasets_by_target(df_or_path=df_path, target_columns=target_columns, verbose=False):
+                    if safe_mode:
+                        filename = df_dir.name + '_' + target_name + '_' + df_name
+                    else:
+                        filename = target_name + '_' + df_name
+                    save_dataframe(df=df, save_dir=save_dir, filename=filename)
+                    total_saved += 1
+            except Exception as e:
+                print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
+                continue
+
+    print(f"{total_saved} single-target datasets were created.")
+
+
 class LogKeys:
     """
     Used for ML scripts only
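
A minimal sketch of the new orchestrator, assuming two source directories of CSV files and two target columns (all paths and column names are illustrative):

from ml_tools.utilities import train_dataset_orchestrator

train_dataset_orchestrator(
    list_of_dirs=["data/batch_A", "data/batch_B"],
    target_columns=["target_strength", "target_density"],
    save_dir="data/single_target",
    safe_mode=True,  # prefix filenames with the source directory name to avoid collisions
)
# Prints how many single-target datasets were written to save_dir.
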
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "3.2.1"
+version = "3.4.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }