dragon-ml-toolbox 8.0.0__py3-none-any.whl → 8.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +47 -10
- ml_tools/data_exploration.py +80 -2
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-8.0.0.dist-info → dragon_ml_toolbox-8.2.0.dist-info}/RECORD
CHANGED

```diff
@@ -1,6 +1,6 @@
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-ml_tools/ETL_engineering.py,sha256=
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-8.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
 ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
@@ -21,7 +21,7 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
 ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
 ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
 ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
-dragon_ml_toolbox-8.
+dragon_ml_toolbox-8.2.0.dist-info/METADATA,sha256=C1rjTnTNSj6VI2khy7Xl1VjQ__MP6-b43x9RIQCHY3E,6778
+dragon_ml_toolbox-8.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-8.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-8.2.0.dist-info/RECORD,,
```
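Each RECORD entry follows the wheel format `path,sha256=<digest>,<size>`: the digest is the file's SHA-256 hash, urlsafe-base64-encoded with trailing `=` padding stripped, and the size is in bytes. A minimal sketch of reproducing an entry locally (the `record_entry` helper is illustrative, not part of this package):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD-style line: path, urlsafe-base64 SHA-256
    digest (trailing '=' padding stripped), and file size in bytes."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    digest_str = digest.rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest_str},{len(data)}"

# e.g. record_entry("ml_tools/ETL_engineering.py") should reproduce
# "ml_tools/ETL_engineering.py,sha256=69YGK4fN5ouRBknTvU4uZ8KLQGT-hPrvwymH-IygEnk,40911"
```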
ml_tools/ETL_engineering.py
CHANGED
```diff
@@ -3,7 +3,6 @@ import re
 from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import warnings
 
 
 __all__ = [
@@ -13,6 +12,7 @@ __all__ = [
     "DataProcessor",
     "BinaryTransformer",
     "MultiBinaryDummifier",
+    "AutoDummifier",
     "KeywordDummifier",
     "NumberExtractor",
     "MultiNumberExtractor",
@@ -277,16 +277,32 @@ class DataProcessor:
                 processed_columns.append(result.alias(output_col_spec))
 
             elif isinstance(result, pl.DataFrame):
-
-
-
-
-
-
-
+                # 1. Handle list-based renaming
+                if isinstance(output_col_spec, list):
+                    if len(result.columns) != len(output_col_spec):
+                        raise ValueError(
+                            f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
+                            f"but recipe specifies {len(output_col_spec)} output names."
+                        )
+
+                    renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
+                    processed_columns.extend(renamed_df.get_columns())
+
+                # 2. Handle a string prefix for AutoDummifier
+                elif isinstance(output_col_spec, str):
+                    prefix = output_col_spec
+                    # Replace the original name part with the desired prefix.
+                    new_names = {
+                        col: f"{prefix}{col[len(input_col_name):]}" for col in result.columns
+                    }
+                    renamed_df = result.rename(new_names)
+                    processed_columns.extend(renamed_df.get_columns())
 
-
-
+                else:
+                    raise TypeError(
+                        f"Function for '{input_col_name}' returned a DataFrame, "
+                        f"so 'output_col' must be a list of names or a string prefix."
+                    )
 
             else:
                 raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
```
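The new `DataProcessor` branch accepts two ways to name a multi-column result: an explicit list of output names (positional rename) or a single string used as a prefix that replaces the input column's stem. A standalone Polars sketch of both renaming rules (the `result` values here are illustrative, not taken from the package):

```python
import polars as pl

# A multi-column result such as a one-to-many transformer might
# return for an input column named "color"
result = pl.Series("color", ["blue", "red", "blue"]).to_dummies()

# 1) List spec: positional rename; lengths must match
renamed = result.rename(dict(zip(result.columns, ["is_blue", "is_red"])))

# 2) String spec: swap the "color" stem for the prefix, keeping each suffix
prefix = "colour"
renamed_by_prefix = result.rename(
    {col: f"{prefix}{col[len('color'):]}" for col in result.columns}
)

print(renamed.columns)            # ['is_blue', 'is_red']
print(renamed_by_prefix.columns)  # ['colour_blue', 'colour_red']
```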
```diff
@@ -413,6 +429,27 @@ class BinaryTransformer:
         return (~contains_keyword).cast(pl.UInt8)
 
 
+class AutoDummifier:
+    """
+    A transformer that performs one-hot encoding on a categorical column,
+    automatically detecting the unique categories from the data.
+    """
+    def __call__(self, column: pl.Series) -> pl.DataFrame:
+        """
+        Executes the one-hot encoding logic.
+
+        Args:
+            column (pl.Series): The input Polars Series of categories.
+
+        Returns:
+            pl.DataFrame: A DataFrame with one-hot encoded columns.
+                Column names are auto-generated by Polars as
+                '{original_col_name}_{category_value}'.
+        """
+        # Ensure the column is treated as a string before creating dummies
+        return column.cast(pl.Utf8).to_dummies()
+
+
 class MultiBinaryDummifier:
     """
     A one-to-many transformer that creates multiple binary columns from a single
```
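Usage reduces to calling an instance on a Series; categories are discovered from the data. A short sketch (assumes the package is installed so `AutoDummifier` is importable; column order may vary):

```python
import polars as pl
from ml_tools.ETL_engineering import AutoDummifier

colors = pl.Series("color", ["blue", "red", "blue", "green"])
dummies = AutoDummifier()(colors)

# One 0/1 indicator column per detected category, named
# '{original_col_name}_{category_value}'
print(dummies.columns)  # e.g. ['color_blue', 'color_green', 'color_red']
```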
ml_tools/data_exploration.py
CHANGED
```diff
@@ -5,10 +5,12 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from typing import Union, Literal, Dict, Tuple, List, Optional
 from pathlib import Path
+import re
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
-import
+from .utilities import save_dataframe
 
 
 # Keep track of all available tools, show using `info()`
@@ -18,6 +20,7 @@ __all__ = [
     "drop_rows_with_missing_data",
     "show_null_columns",
     "drop_columns_with_missing_data",
+    "drop_macro",
     "split_features_targets",
     "split_continuous_binary",
     "plot_correlation_heatmap",
@@ -155,7 +158,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
 
 def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     """
-
+    Returns a table of columns with missing values, showing both the count and
     percentage of missing entries per column.
 
    Parameters:
```
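The restored docstring describes a count-plus-percentage report of missing values. A hypothetical pandas re-implementation of that summary, to make the contract concrete (not the package's exact code):

```python
import pandas as pd

def null_summary(df: pd.DataFrame, round_digits: int = 2) -> pd.DataFrame:
    # One row per column that has missing entries: absolute count
    # and percentage of the column that is null
    counts = df.isnull().sum()
    table = pd.DataFrame({
        "null_count": counts,
        "null_percent": (counts / len(df) * 100).round(round_digits),
    })
    return table[table["null_count"] > 0]
```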
```diff
@@ -221,6 +224,81 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     return df
 
 
+def drop_macro(df: pd.DataFrame,
+               log_directory: Union[str,Path],
+               targets: list[str],
+               skip_targets: bool=False,
+               threshold: float=0.7) -> pd.DataFrame:
+    """
+    Iteratively removes rows and columns with excessive missing data.
+
+    This function performs a comprehensive cleaning cycle on a DataFrame. It
+    repeatedly drops columns with constant values, followed by rows and columns that exceed
+    a specified threshold of missing values. The process continues until the
+    DataFrame's dimensions stabilize, ensuring that the interdependency between
+    row and column deletions is handled.
+
+    Initial and final missing data reports are saved to the specified log directory.
+
+    Args:
+        df (pd.DataFrame): The input pandas DataFrame to be cleaned.
+        log_directory (Union[str, Path]): Path to the directory where the
+            'Missing_Data_start.csv' and 'Missing_Data_final.csv' logs
+            will be saved.
+        targets (list[str]): A list of column names to be treated as target
+            variables. This list guides the row-dropping logic.
+        skip_targets (bool, optional): If True, the columns listed in `targets`
+            will be exempt from being dropped, even if they exceed the missing
+            data threshold.
+        threshold (float, optional): The proportion of missing data required to drop
+            a row or column. For example, 0.7 means a row/column will be
+            dropped if 70% or more of its data is missing.
+
+    Returns:
+        pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
+    """
+    # make a deep copy to work with
+    df_clean = df.copy()
+
+    # Log initial state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_start")
+
+    # Clean cycles for rows and columns
+    master = True
+    while master:
+        # track rows and columns
+        initial_rows, initial_columns = df_clean.shape
+
+        # drop constant columns
+        df_clean = drop_constant_columns(df=df_clean)
+
+        # clean rows
+        df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
+
+        # clean columns
+        if skip_targets:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
+        else:
+            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
+
+        # cleaned?
+        remaining_rows, remaining_columns = df_clean.shape
+        if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
+            master = False
+
+    # log final state
+    missing_data = show_null_columns(df=df_clean)
+    save_dataframe(df=missing_data.reset_index(drop=False),
+                   save_dir=log_directory,
+                   filename="Missing_Data_final")
+
+    # return cleaned dataframe
+    return df_clean
+
+
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
     """
     Splits a DataFrame's columns into features and targets.
```
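A hedged usage sketch for the new `drop_macro` (toy data; assumes the package is installed and that `save_dataframe` can write into a `logs/` directory):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_macro

df = pd.DataFrame({
    "target":     [1.0, 0.0, 1.0, np.nan, 1.0],
    "mostly_nan": [np.nan, np.nan, np.nan, np.nan, 2.0],  # 80% missing -> dropped
    "constant":   [5, 5, 5, 5, 5],                        # constant -> dropped
    "feature":    [0.1, 0.2, 0.3, 0.4, 0.5],
})

clean = drop_macro(df, log_directory="logs", targets=["target"],
                   skip_targets=True, threshold=0.7)
print(clean.columns.tolist())  # e.g. ['target', 'feature']
```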
Files without changes: WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt