dragon-ml-toolbox 4.2.2__tar.gz → 4.4.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic. Review the release details before installing.
- {dragon_ml_toolbox-4.2.2/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-4.4.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ETL_engineering.py +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/custom_logger.py +15 -5
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/data_exploration.py +49 -30
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/README.md +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-4.2.2 → dragon_ml_toolbox-4.4.0}/setup.cfg +0 -0
ml_tools/ETL_engineering.py

```diff
@@ -569,7 +569,7 @@ class NumberExtractor:
         self,
         regex_pattern: str = r"(\d+\.?\d*)",
         dtype: Literal["float", "int"] = "float",
-        round_digits: Optional[int] =
+        round_digits: Optional[int] = 2,
     ):
         # --- Validation ---
         if not isinstance(regex_pattern, str):
```
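The only change to NumberExtractor is the round_digits default, which becomes 2. Below is a minimal sketch of what that default implies for callers, using the parameters visible in the hunk; the extraction logic is an illustrative stand-in, not the library's implementation.

```python
import re
from typing import Literal, Optional

def extract_number(
    text: str,
    regex_pattern: str = r"(\d+\.?\d*)",
    dtype: Literal["float", "int"] = "float",
    round_digits: Optional[int] = 2,
):
    """Illustrative stand-in for NumberExtractor's extraction step."""
    match = re.search(regex_pattern, text)
    if match is None:
        return None
    value = float(match.group(1))
    if dtype == "int":
        return int(value)
    # With the new 4.4.0 default, floats are rounded to 2 digits
    # unless round_digits is explicitly set to None.
    return round(value, round_digits) if round_digits is not None else value

print(extract_number("pH 7.4321"))  # 7.43 under the new default
```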
ml_tools/custom_logger.py

```diff
@@ -1,9 +1,9 @@
 from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
-import pandas as pd
 import traceback
 import json
+import csv
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
```
```diff
@@ -18,7 +18,6 @@ def custom_logger(
     data: Union[
         List[Any],
         Dict[Any, Any],
-        pd.DataFrame,
         str,
         BaseException
     ],
```
```diff
@@ -75,7 +74,7 @@ def custom_logger(
         _log_exception_to_log(data, base_path.with_suffix(".log"))

     else:
-        raise ValueError("Unsupported data type. Must be list, dict,
+        raise ValueError("Unsupported data type. Must be list, dict, str, or BaseException.")

     _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")

```
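The updated error message reflects the same change as the import hunk above: pandas DataFrames are no longer an accepted input type. Callers that previously logged a DataFrame would need to convert it first; a hedged sketch follows (custom_logger's full signature is not shown in this diff).

```python
import pandas as pd

df = pd.DataFrame({"epoch": [1, 2], "loss": [0.9, 0.5]})

# Dict[Any, Any] is still supported; orient="list" produces one list per
# column, which is the shape the new _log_dict_to_csv expects.
log_data = df.to_dict(orient="list")
# custom_logger(log_data, ...)  # remaining arguments not shown in this diff
```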
```diff
@@ -106,8 +105,19 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
         padded_value = value + [None] * (max_length - len(value))
         sanitized_dict[sanitized_key] = padded_value

-
-
+    # The `newline=''` argument is important to prevent extra blank rows
+    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file)
+
+        # 1. Write the header row from the sanitized dictionary keys
+        header = list(sanitized_dict.keys())
+        writer.writerow(header)
+
+        # 2. Transpose columns to rows and write them
+        # zip(*sanitized_dict.values()) elegantly converts the column data
+        # (lists in the dict) into row-by-row tuples.
+        rows_to_write = zip(*sanitized_dict.values())
+        writer.writerows(rows_to_write)


 def _log_string_to_log(data: str, path: Path) -> None:
```
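The rewritten _log_dict_to_csv drops the pandas dependency in favor of the standard library. The core trick is zip(*values), which transposes column lists into rows; here is a self-contained demonstration with invented data.

```python
import csv

columns = {"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]}

with open("log.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(columns.keys())           # header: epoch,loss
    writer.writerows(zip(*columns.values()))  # rows: (1, 0.9), (2, 0.5), (3, 0.3)
```

Note that zip stops at the shortest input, which is why the function pads every column to max_length with None before writing.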
ml_tools/data_exploration.py

```diff
@@ -15,9 +15,9 @@ __all__ = [
     "summarize_dataframe",
     "drop_constant_columns",
     "drop_rows_with_missing_data",
-    "split_features_targets",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
     "plot_value_distributions",
```
```diff
@@ -125,17 +125,19 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],

     # Stage 1: Drop rows with all target columns missing
     if targets is not None:
-
+        # validate targets
+        valid_targets = _validate_columns(df_clean, targets)
+        target_na = df_clean[valid_targets].isnull().all(axis=1)
         if target_na.any():
             print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
             df_clean = df_clean[~target_na]
         else:
             print("✅ No rows with all targets missing.")
     else:
-
+        valid_targets = []

     # Stage 2: Drop rows based on feature column missing values
-    feature_cols = [col for col in df_clean.columns if col not in
+    feature_cols = [col for col in df_clean.columns if col not in valid_targets]
     if feature_cols:
         feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
         rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
```
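Stage 2 keeps its row-dropping rule: a row is removed when the fraction of missing feature values exceeds threshold. A toy illustration of that rule with invented data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, None, None], "b": [None, None, 3]})
threshold = 0.7

feature_na_frac = df.isnull().mean(axis=1)  # 0.5, 1.0, 0.5 per row
rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
print(df.drop(index=rows_to_drop))  # only the all-missing middle row is dropped
```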
```diff
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     return df_clean


-def split_features_targets(df: pd.DataFrame, targets: list[str]):
-    """
-    Splits a DataFrame's columns into features and targets.
-
-    Args:
-        df (pd.DataFrame): Pandas DataFrame containing the dataset.
-        targets (list[str]): List of column names to be treated as target variables.
-
-    Returns:
-        tuple: A tuple containing:
-            - pd.DataFrame: Features dataframe.
-            - pd.DataFrame: Targets dataframe.
-
-    Prints:
-        - Shape of the original dataframe.
-        - Shape of the features dataframe.
-        - Shape of the targets dataframe.
-    """
-    df_targets = df[targets]
-    df_features = df.drop(columns=targets)
-    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
-    return df_features, df_targets
-
-
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
     Displays a table of columns with missing values, showing both the count and
```
```diff
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary


-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.

```
```diff
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
         show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
+        skip_columns (list[str] | None): If given, these columns wont be included in the drop process.

     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
     """
-
+    # If skip_columns is provided, create a list of columns to check.
+    # Otherwise, check all columns.
+    cols_to_check = df.columns
+    if skip_columns:
+        # Use set difference for efficient exclusion
+        cols_to_check = df.columns.difference(skip_columns)
+
+    # Calculate the missing fraction only on the columns to be checked
+    missing_fraction = df[cols_to_check].isnull().mean()
+
+
     cols_to_drop = missing_fraction[missing_fraction > threshold].index

     if len(cols_to_drop) > 0:
```
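The new skip_columns parameter shields selected columns from the threshold check via pandas' Index.difference. A hypothetical call, with column names and data invented (the import path is assumed from the package layout):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_columns_with_missing_data  # assumed import path

df = pd.DataFrame({
    "sparse":   [np.nan, np.nan, np.nan, 1.0],
    "target":   [np.nan, np.nan, np.nan, 2.0],
    "complete": [1.0, 2.0, 3.0, 4.0],
})

# Both sparse columns exceed the 0.7 threshold (75% missing), but
# protecting "target" keeps it in the returned DataFrame.
cleaned = drop_columns_with_missing_data(df, threshold=0.7, skip_columns=["target"])
```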
```diff
@@ -231,6 +220,31 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     return df


+def split_features_targets(df: pd.DataFrame, targets: list[str]):
+    """
+    Splits a DataFrame's columns into features and targets.
+
+    Args:
+        df (pd.DataFrame): Pandas DataFrame containing the dataset.
+        targets (list[str]): List of column names to be treated as target variables.
+
+    Returns:
+        tuple: A tuple containing:
+            - pd.DataFrame: Features dataframe.
+            - pd.DataFrame: Targets dataframe.
+
+    Prints:
+        - Shape of the original dataframe.
+        - Shape of the features dataframe.
+        - Shape of the targets dataframe.
+    """
+    valid_targets = _validate_columns(df, targets)
+    df_targets = df[valid_targets]
+    df_features = df.drop(columns=valid_targets)
+    print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
+    return df_features, df_targets
+
+
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
```
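split_features_targets is re-added further down the module, now routed through the new _validate_columns helper. One behavioral consequence, shown with invented data: unknown target names are silently filtered out instead of raising a KeyError as the 4.2.2 version would.

```python
import pandas as pd
from ml_tools.data_exploration import split_features_targets  # assumed import path

df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "y": [0, 1]})
X, y = split_features_targets(df, targets=["y", "not_a_column"])
# Features shape: (2, 2); Targets shape: (2, 1) ("not_a_column" is ignored)
```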
```diff
@@ -631,5 +645,10 @@ def standardize_percentages(
     return df_copy


+def _validate_columns(df: pd.DataFrame, columns: list[str]):
+    valid_columns = [column for column in columns if column in df.columns]
+    return valid_columns
+
+
 def info():
     _script_info(__all__)
```