dragon-ml-toolbox 3.2.1__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +64 -15
- ml_tools/data_exploration.py +42 -0
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.2.1.dist-info → dragon_ml_toolbox-3.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dragon_ml_toolbox-3.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
-
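dragon_ml_toolbox-3.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892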
|
|
3
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
1
|
+
dragon_ml_toolbox-3.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-3.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
|
|
3
|
+
ml_tools/ETL_engineering.py,sha256=TL5sz7ac4uwa993Uy6Ex11-9vN_t5IaD-Y9UjyI88_o,39030
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
|
|
@@ -13,13 +13,13 @@ ml_tools/VIF_factor.py,sha256=5GVAldH69Vkei3WRUZN1uPBMzGoOOeEOA-bgmZXbbUw,10301
|
|
|
13
13
|
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
14
|
ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
|
|
15
15
|
ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
|
|
16
|
-
ml_tools/data_exploration.py,sha256=
|
|
16
|
+
ml_tools/data_exploration.py,sha256=IkpOyIRPKdu4qKeUdvvMvNPelSVWegNEKMqX3IInmpw,25003
|
|
17
17
|
ml_tools/datasetmaster.py,sha256=N-uwfzWnl_qnoAqjbfS98I1pVNra5u6rhKLdWbFIReA,30122
|
|
18
18
|
ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s,37470
|
|
19
19
|
ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
|
|
20
20
|
ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
|
|
21
21
|
ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
|
|
22
|
-
dragon_ml_toolbox-3.
|
|
23
|
-
dragon_ml_toolbox-3.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
24
|
-
dragon_ml_toolbox-3.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
25
|
-
dragon_ml_toolbox-3.2.1.dist-info/RECORD,,
|
|
22
|
+
dragon_ml_toolbox-3.3.0.dist-info/METADATA,sha256=rIwDGwx-RQ1XvVe0TLA8YkQIs71W7MoQJHfx28l580M,3273
|
|
23
|
+
dragon_ml_toolbox-3.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
24
|
+
dragon_ml_toolbox-3.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
25
|
+
dragon_ml_toolbox-3.3.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -12,6 +12,7 @@ __all__ = [
|
|
|
12
12
|
"TransformationRecipe",
|
|
13
13
|
"DataProcessor",
|
|
14
14
|
"BinaryTransformer",
|
|
15
|
+
"MultiBinaryDummifier",
|
|
15
16
|
"KeywordDummifier",
|
|
16
17
|
"NumberExtractor",
|
|
17
18
|
"MultiNumberExtractor",
|
|
@@ -400,12 +401,72 @@ class BinaryTransformer:
|
|
|
400
401
|
return (~contains_keyword).cast(pl.UInt8)
|
|
401
402
|
|
|
402
403
|
|
|
404
|
+
class MultiBinaryDummifier:
    """
    One-to-many transformer that expands a single text column into one binary
    (0/1) column per keyword.

    Every keyword yields its own output column holding 1 where the keyword occurs
    in the input string and 0 where it does not. Null inputs stay null in every
    output column. Intended for use inside the DataProcessor pipeline.

    Args:
        keywords (List[str]):
            Non-empty list of keywords to look for. One binary column is produced
            per keyword, in the order given.
        case_insensitive (bool):
            When True, matching ignores letter case. Defaults to True.

    Raises:
        TypeError: If `keywords` is not a list made up only of strings.
        ValueError: If `keywords` is an empty list.
    """
    def __init__(self, keywords: List[str], case_insensitive: bool = True):
        is_list_of_str = isinstance(keywords, list) and all(isinstance(k, str) for k in keywords)
        if not is_list_of_str:
            raise TypeError("The 'keywords' argument must be a list of strings.")
        if len(keywords) == 0:
            raise ValueError("The 'keywords' list cannot be empty.")

        self.keywords = keywords
        self.case_insensitive = case_insensitive

    def __call__(self, column: pl.Series) -> pl.DataFrame:
        """
        Build the binary indicator columns for `column`.

        Args:
            column (pl.Series): The input Polars Series to transform.

        Returns:
            pl.DataFrame: One UInt8 column per keyword, named `col_0`, `col_1`, ...
            in keyword order.
        """
        # Work on a string view of the data; nulls survive the cast
        text = column.cast(pl.Utf8)

        # Inline regex flag that switches on case-insensitive matching
        flag_prefix = "(?i)" if self.case_insensitive else ""

        indicator_exprs = []
        for position, word in enumerate(self.keywords):
            # re.escape makes the keyword match literally instead of as a regex
            regex = f"{flag_prefix}{re.escape(word)}"

            found = text.str.contains(regex)
            indicator = (
                pl.when(text.is_null())
                .then(None)  # keep nulls from the source column
                .when(found)
                .then(pl.lit(1, dtype=pl.UInt8))
                .otherwise(pl.lit(0, dtype=pl.UInt8))
                .alias(f"col_{position}")  # placeholder name, per DataProcessor convention
            )
            indicator_exprs.append(indicator)

        return pl.select(indicator_exprs)
|
|
462
|
+
|
|
463
|
+
|
|
403
464
|
class KeywordDummifier:
|
|
404
465
|
"""
|
|
405
466
|
A configurable transformer that creates one-hot encoded columns based on
|
|
406
467
|
keyword matching in a Polars Series.
|
|
407
468
|
|
|
408
|
-
|
|
469
|
+
Operates on a "first match wins" principle.
|
|
409
470
|
|
|
410
471
|
Args:
|
|
411
472
|
group_names (List[str]):
|
|
@@ -417,17 +478,14 @@ class KeywordDummifier:
|
|
|
417
478
|
`group_name` at the same index and contains the keywords to search for.
|
|
418
479
|
case_insensitive (bool):
|
|
419
480
|
If True, keyword matching ignores case.
|
|
420
|
-
drop_empty (bool):
|
|
421
|
-
If True, columns that contain no positive matches (all zeros) will be dropped from the final output.
|
|
422
481
|
"""
|
|
423
|
-
def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True
|
|
482
|
+
def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
|
|
424
483
|
if len(group_names) != len(group_keywords):
|
|
425
484
|
raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
|
|
426
485
|
|
|
427
486
|
self.group_names = group_names
|
|
428
487
|
self.group_keywords = group_keywords
|
|
429
488
|
self.case_insensitive = case_insensitive
|
|
430
|
-
self.drop_empty = drop_empty
|
|
431
489
|
|
|
432
490
|
def __call__(self, column: pl.Series) -> pl.DataFrame:
|
|
433
491
|
"""
|
|
@@ -474,16 +532,7 @@ class KeywordDummifier:
|
|
|
474
532
|
# If a group had no matches, create a column of zeros
|
|
475
533
|
final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
|
|
476
534
|
|
|
477
|
-
|
|
478
|
-
result_df = pl.DataFrame(final_columns)
|
|
479
|
-
|
|
480
|
-
# If drop_empty is True, filter out all-zero columns
|
|
481
|
-
if self.drop_empty:
|
|
482
|
-
# A column is kept if its sum is greater than 0
|
|
483
|
-
cols_to_keep = [col for col in result_df if col.sum() > 0]
|
|
484
|
-
return result_df.select(cols_to_keep)
|
|
485
|
-
|
|
486
|
-
return result_df
|
|
535
|
+
return pl.DataFrame(final_columns)
|
|
487
536
|
|
|
488
537
|
|
|
489
538
|
class NumberExtractor:
|
ml_tools/data_exploration.py
CHANGED
|
@@ -15,6 +15,7 @@ import re
|
|
|
15
15
|
# Keep track of all available tools, show using `info()`
|
|
16
16
|
__all__ = [
|
|
17
17
|
"summarize_dataframe",
|
|
18
|
+
"drop_zero_only_columns",
|
|
18
19
|
"drop_rows_with_missing_data",
|
|
19
20
|
"split_features_targets",
|
|
20
21
|
"show_null_columns",
|
|
@@ -61,6 +62,47 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
|
|
|
61
62
|
return summary
|
|
62
63
|
|
|
63
64
|
|
|
65
|
+
def drop_zero_only_columns(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
    """
    Removes columns from a pandas DataFrame that contain only zeros and null/NaN values.

    This utility is useful for cleaning data after dummification steps that may result in empty columns.
    Non-numeric columns are always kept. A numeric column is kept only if it holds
    at least one value that is both non-null and non-zero.

    Args:
        df (pd.DataFrame):
            The pandas DataFrame to clean. It is not modified.
        verbose (bool):
            If True, prints the number and names of the dropped columns
            (in their original column order). Defaults to True.

    Returns:
        pd.DataFrame:
            A new DataFrame with the empty columns removed.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    # Track columns by position so duplicate column labels are handled correctly
    keep_positions = []
    dropped_columns = []
    for position, (col_name, column) in enumerate(df.items()):
        # Keep any column that is not numeric by default
        if not is_numeric_dtype(column):
            keep_positions.append(position)
            continue

        # `NaN != 0` evaluates to True, so nulls must be masked out explicitly;
        # otherwise a column of only zeros and NaN would wrongly be kept.
        has_nonzero_value = (column.notna() & (column != 0)).any()
        if has_nonzero_value:
            keep_positions.append(position)
        else:
            dropped_columns.append(col_name)

    if dropped_columns and verbose:
        print(f"Dropped {len(dropped_columns)} columns:")
        for dropped_column in dropped_columns:
            print(f" {dropped_column}")

    return df.iloc[:, keep_positions]
|
|
104
|
+
|
|
105
|
+
|
|
64
106
|
def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
|
|
65
107
|
"""
|
|
66
108
|
Drops rows from the DataFrame using a two-stage strategy:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|