dragon-ml-toolbox 4.2.2__tar.gz → 4.4.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (35)
  1. {dragon_ml_toolbox-4.2.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-4.4.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ETL_engineering.py +1 -1
  4. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/custom_logger.py +15 -5
  5. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/data_exploration.py +49 -30
  6. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/pyproject.toml +1 -1
  7. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE +0 -0
  8. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE-THIRD-PARTY.md +0 -0
  9. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/README.md +0 -0
  10. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  13. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  14. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_evaluation.py +0 -0
  18. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_inference.py +0 -0
  19. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_trainer.py +0 -0
  20. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/PSO_optimization.py +0 -0
  21. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/RNN_forecast.py +0 -0
  22. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/SQL.py +0 -0
  23. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/VIF_factor.py +0 -0
  24. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/__init__.py +0 -0
  25. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_logger.py +0 -0
  26. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_pytorch_models.py +0 -0
  27. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_script_info.py +0 -0
  28. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/datasetmaster.py +0 -0
  29. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_inference.py +0 -0
  30. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_learning.py +0 -0
  31. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/handle_excel.py +0 -0
  32. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/keys.py +0 -0
  33. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/path_manager.py +0 -0
  34. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/utilities.py +0 -0
  35. {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 4.2.2
3
+ Version: 4.4.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 4.2.2
3
+ Version: 4.4.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -569,7 +569,7 @@ class NumberExtractor:
569
569
  self,
570
570
  regex_pattern: str = r"(\d+\.?\d*)",
571
571
  dtype: Literal["float", "int"] = "float",
572
- round_digits: Optional[int] = None,
572
+ round_digits: Optional[int] = 2,
573
573
  ):
574
574
  # --- Validation ---
575
575
  if not isinstance(regex_pattern, str):
@@ -1,9 +1,9 @@
1
1
  from pathlib import Path
2
2
  from datetime import datetime
3
3
  from typing import Union, List, Dict, Any
4
- import pandas as pd
5
4
  import traceback
6
5
  import json
6
+ import csv
7
7
  from .path_manager import sanitize_filename, make_fullpath
8
8
  from ._script_info import _script_info
9
9
  from ._logger import _LOGGER
@@ -18,7 +18,6 @@ def custom_logger(
18
18
  data: Union[
19
19
  List[Any],
20
20
  Dict[Any, Any],
21
- pd.DataFrame,
22
21
  str,
23
22
  BaseException
24
23
  ],
@@ -75,7 +74,7 @@ def custom_logger(
75
74
  _log_exception_to_log(data, base_path.with_suffix(".log"))
76
75
 
77
76
  else:
78
- raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
77
+ raise ValueError("Unsupported data type. Must be list, dict, str, or BaseException.")
79
78
 
80
79
  _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")
81
80
 
@@ -106,8 +105,19 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
106
105
  padded_value = value + [None] * (max_length - len(value))
107
106
  sanitized_dict[sanitized_key] = padded_value
108
107
 
109
- df = pd.DataFrame(sanitized_dict)
110
- df.to_csv(path, index=False)
108
+ # The `newline=''` argument is important to prevent extra blank rows
109
+ with open(path, 'w', newline='', encoding='utf-8') as csv_file:
110
+ writer = csv.writer(csv_file)
111
+
112
+ # 1. Write the header row from the sanitized dictionary keys
113
+ header = list(sanitized_dict.keys())
114
+ writer.writerow(header)
115
+
116
+ # 2. Transpose columns to rows and write them
117
+ # zip(*sanitized_dict.values()) elegantly converts the column data
118
+ # (lists in the dict) into row-by-row tuples.
119
+ rows_to_write = zip(*sanitized_dict.values())
120
+ writer.writerows(rows_to_write)
111
121
 
112
122
 
113
123
  def _log_string_to_log(data: str, path: Path) -> None:
@@ -15,9 +15,9 @@ __all__ = [
15
15
  "summarize_dataframe",
16
16
  "drop_constant_columns",
17
17
  "drop_rows_with_missing_data",
18
- "split_features_targets",
19
18
  "show_null_columns",
20
19
  "drop_columns_with_missing_data",
20
+ "split_features_targets",
21
21
  "split_continuous_binary",
22
22
  "plot_correlation_heatmap",
23
23
  "plot_value_distributions",
@@ -125,17 +125,19 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
125
125
 
126
126
  # Stage 1: Drop rows with all target columns missing
127
127
  if targets is not None:
128
- target_na = df_clean[targets].isnull().all(axis=1)
128
+ # validate targets
129
+ valid_targets = _validate_columns(df_clean, targets)
130
+ target_na = df_clean[valid_targets].isnull().all(axis=1)
129
131
  if target_na.any():
130
132
  print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
131
133
  df_clean = df_clean[~target_na]
132
134
  else:
133
135
  print("✅ No rows with all targets missing.")
134
136
  else:
135
- targets = []
137
+ valid_targets = []
136
138
 
137
139
  # Stage 2: Drop rows based on feature column missing values
138
- feature_cols = [col for col in df_clean.columns if col not in targets]
140
+ feature_cols = [col for col in df_clean.columns if col not in valid_targets]
139
141
  if feature_cols:
140
142
  feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
141
143
  rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
150
152
  return df_clean
151
153
 
152
154
 
153
- def split_features_targets(df: pd.DataFrame, targets: list[str]):
154
- """
155
- Splits a DataFrame's columns into features and targets.
156
-
157
- Args:
158
- df (pd.DataFrame): Pandas DataFrame containing the dataset.
159
- targets (list[str]): List of column names to be treated as target variables.
160
-
161
- Returns:
162
- tuple: A tuple containing:
163
- - pd.DataFrame: Features dataframe.
164
- - pd.DataFrame: Targets dataframe.
165
-
166
- Prints:
167
- - Shape of the original dataframe.
168
- - Shape of the features dataframe.
169
- - Shape of the targets dataframe.
170
- """
171
- df_targets = df[targets]
172
- df_features = df.drop(columns=targets)
173
- print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
174
- return df_features, df_targets
175
-
176
-
177
155
  def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
178
156
  """
179
157
  Displays a table of columns with missing values, showing both the count and
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
202
180
  return null_summary
203
181
 
204
182
 
205
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
183
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
206
184
  """
207
185
  Drops columns with more than `threshold` fraction of missing values.
208
186
 
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
210
188
  df (pd.DataFrame): The input DataFrame.
211
189
  threshold (float): Fraction of missing values above which columns are dropped.
212
190
  show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
191
+ skip_columns (list[str] | None): If given, these columns wont be included in the drop process.
213
192
 
214
193
  Returns:
215
194
  pd.DataFrame: A new DataFrame without the dropped columns.
216
195
  """
217
- missing_fraction = df.isnull().mean()
196
+ # If skip_columns is provided, create a list of columns to check.
197
+ # Otherwise, check all columns.
198
+ cols_to_check = df.columns
199
+ if skip_columns:
200
+ # Use set difference for efficient exclusion
201
+ cols_to_check = df.columns.difference(skip_columns)
202
+
203
+ # Calculate the missing fraction only on the columns to be checked
204
+ missing_fraction = df[cols_to_check].isnull().mean()
205
+
206
+
218
207
  cols_to_drop = missing_fraction[missing_fraction > threshold].index
219
208
 
220
209
  if len(cols_to_drop) > 0:
@@ -231,6 +220,31 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
231
220
  return df
232
221
 
233
222
 
223
+ def split_features_targets(df: pd.DataFrame, targets: list[str]):
224
+ """
225
+ Splits a DataFrame's columns into features and targets.
226
+
227
+ Args:
228
+ df (pd.DataFrame): Pandas DataFrame containing the dataset.
229
+ targets (list[str]): List of column names to be treated as target variables.
230
+
231
+ Returns:
232
+ tuple: A tuple containing:
233
+ - pd.DataFrame: Features dataframe.
234
+ - pd.DataFrame: Targets dataframe.
235
+
236
+ Prints:
237
+ - Shape of the original dataframe.
238
+ - Shape of the features dataframe.
239
+ - Shape of the targets dataframe.
240
+ """
241
+ valid_targets = _validate_columns(df, targets)
242
+ df_targets = df[valid_targets]
243
+ df_features = df.drop(columns=valid_targets)
244
+ print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
245
+ return df_features, df_targets
246
+
247
+
234
248
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
235
249
  """
236
250
  Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -631,5 +645,10 @@ def standardize_percentages(
631
645
  return df_copy
632
646
 
633
647
 
648
+ def _validate_columns(df: pd.DataFrame, columns: list[str]):
649
+ valid_columns = [column for column in columns if column in df.columns]
650
+ return valid_columns
651
+
652
+
634
653
  def info():
635
654
  _script_info(__all__)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "4.2.2"
3
+ version = "4.4.0"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }