dragon-ml-toolbox 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +27 -31
- ml_tools/data_exploration.py +41 -28
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-4.2.1.dist-info → dragon_ml_toolbox-4.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dragon_ml_toolbox-4.
|
|
2
|
-
dragon_ml_toolbox-4.
|
|
3
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
1
|
+
dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
|
+
ml_tools/ETL_engineering.py,sha256=P7HN_e3vfmrOqDDK-IenyRSFQPr0N3V9e2gN75QFVWs,39372
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=0a-Rbr0Xp_B1FNopOKBBmuJ4MqazS5JgDiT7wx1dHvE,13161
|
|
@@ -16,7 +16,7 @@ ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
|
|
|
16
16
|
ml_tools/_pytorch_models.py,sha256=ewPPsTHgmRPzMMWwObZOdH1vxm2Ij2VWZP38NC6zSH4,10135
|
|
17
17
|
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
18
18
|
ml_tools/custom_logger.py,sha256=a3ywSCQT7j5ypR-usnKh2l861d_aVJ93ZRVqxrHsBBw,4112
|
|
19
|
-
ml_tools/data_exploration.py,sha256=
|
|
19
|
+
ml_tools/data_exploration.py,sha256=T4nO9YSDGvrpom7JELtoQTyg7XTEmvQz-jG0KKxqTRk,23467
|
|
20
20
|
ml_tools/datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
|
|
21
21
|
ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
|
|
22
22
|
ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
|
|
@@ -24,7 +24,7 @@ ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,1300
|
|
|
24
24
|
ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
|
|
25
25
|
ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
|
|
26
26
|
ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
|
|
27
|
-
dragon_ml_toolbox-4.
|
|
28
|
-
dragon_ml_toolbox-4.
|
|
29
|
-
dragon_ml_toolbox-4.
|
|
30
|
-
dragon_ml_toolbox-4.
|
|
27
|
+
dragon_ml_toolbox-4.3.0.dist-info/METADATA,sha256=7aZO_5P8SDx4tPFTtb3MTAaRgf_vbcOEURaxpT3MGK8,6572
|
|
28
|
+
dragon_ml_toolbox-4.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
+
dragon_ml_toolbox-4.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
30
|
+
dragon_ml_toolbox-4.3.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -3,6 +3,7 @@ import re
|
|
|
3
3
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
4
4
|
from ._script_info import _script_info
|
|
5
5
|
from ._logger import _LOGGER
|
|
6
|
+
import warnings
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
__all__ = [
|
|
@@ -50,7 +51,7 @@ class ColumnCleaner:
|
|
|
50
51
|
```python
|
|
51
52
|
id_rules = {
|
|
52
53
|
# Matches 'ID-12345' or 'ID 12345' and reformats to 'ID:12345'
|
|
53
|
-
r'ID[- ](
|
|
54
|
+
r'ID[- ](\\d+)': r'ID:$1'
|
|
54
55
|
}
|
|
55
56
|
|
|
56
57
|
id_cleaner = ColumnCleaner(column_name='user_id', rules=id_rules)
|
|
@@ -700,26 +701,28 @@ class MultiNumberExtractor:
|
|
|
700
701
|
|
|
701
702
|
class RatioCalculator:
|
|
702
703
|
"""
|
|
703
|
-
A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
regex_pattern (str, optional):
|
|
707
|
-
The regex pattern to find the numerator and denominator. It MUST
|
|
708
|
-
contain exactly two capturing groups: the first for the
|
|
709
|
-
numerator and the second for the denominator. Defaults to a
|
|
710
|
-
pattern that handles common delimiters like ':' and '/'.
|
|
704
|
+
A transformer that parses a string ratio (e.g., "40:5" or "30/2") and
|
|
705
|
+
computes the result of the division. It gracefully handles strings that
|
|
706
|
+
do not match the pattern by returning null.
|
|
711
707
|
"""
|
|
712
708
|
def __init__(
|
|
713
709
|
self,
|
|
714
|
-
|
|
710
|
+
# Default pattern includes the full-width colon '：'
|
|
711
|
+
regex_pattern: str = r"(\d+\.?\d*)\s*[:：/]\s*(\d+\.?\d*)"
|
|
715
712
|
):
|
|
716
|
-
# --- Validation ---
|
|
713
|
+
# --- Robust Validation ---
|
|
717
714
|
try:
|
|
718
|
-
|
|
715
|
+
compiled_pattern = re.compile(regex_pattern)
|
|
716
|
+
if compiled_pattern.groups != 2:
|
|
719
717
|
raise ValueError(
|
|
720
|
-
"regex_pattern must contain exactly two "
|
|
718
|
+
"RatioCalculator regex_pattern must contain exactly two "
|
|
721
719
|
"capturing groups '(...)'."
|
|
722
720
|
)
|
|
721
|
+
if compiled_pattern.groupindex:
|
|
722
|
+
raise ValueError(
|
|
723
|
+
"RatioCalculator must be initialized with unnamed capturing groups "
|
|
724
|
+
"(e.g., '(\\d+)'), not named groups (e.g., '(?P<name>\\d+)')."
|
|
725
|
+
)
|
|
723
726
|
except re.error as e:
|
|
724
727
|
raise ValueError(f"Invalid regex pattern provided: {e}") from e
|
|
725
728
|
|
|
@@ -728,27 +731,20 @@ class RatioCalculator:
|
|
|
728
731
|
def __call__(self, column: pl.Series) -> pl.Series:
|
|
729
732
|
"""
|
|
730
733
|
Applies the ratio calculation logic to the input column.
|
|
731
|
-
|
|
732
|
-
Args:
|
|
733
|
-
column (pl.Series): The input Polars Series of ratio strings.
|
|
734
|
-
|
|
735
|
-
Returns:
|
|
736
|
-
pl.Series: A new Series of floats containing the division result.
|
|
737
|
-
Returns null for invalid formats or division by zero.
|
|
734
|
+
This version uses .str.extract() for maximum stability.
|
|
738
735
|
"""
|
|
739
|
-
#
|
|
740
|
-
|
|
741
|
-
|
|
736
|
+
# Extract numerator (group 1) and denominator (group 2) separately.
|
|
737
|
+
numerator_expr = column.str.extract(self.regex_pattern, 1).cast(pl.Float64, strict=False)
|
|
738
|
+
denominator_expr = column.str.extract(self.regex_pattern, 2).cast(pl.Float64, strict=False)
|
|
742
739
|
|
|
743
|
-
#
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
740
|
+
# Calculate the ratio, handling division by zero.
|
|
741
|
+
final_expr = pl.when(denominator_expr != 0).then(
|
|
742
|
+
numerator_expr / denominator_expr
|
|
743
|
+
).otherwise(
|
|
744
|
+
None # Handles both null denominators and division by zero
|
|
745
|
+
)
|
|
747
746
|
|
|
748
|
-
|
|
749
|
-
final_expr = pl.when(denominator != 0).then(numerator / denominator).otherwise(None)
|
|
750
|
-
|
|
751
|
-
return pl.select(final_expr).to_series()
|
|
747
|
+
return pl.select(final_expr.round(4)).to_series()
|
|
752
748
|
|
|
753
749
|
|
|
754
750
|
class CategoryMapper:
|
ml_tools/data_exploration.py
CHANGED
|
@@ -15,9 +15,9 @@ __all__ = [
|
|
|
15
15
|
"summarize_dataframe",
|
|
16
16
|
"drop_constant_columns",
|
|
17
17
|
"drop_rows_with_missing_data",
|
|
18
|
-
"split_features_targets",
|
|
19
18
|
"show_null_columns",
|
|
20
19
|
"drop_columns_with_missing_data",
|
|
20
|
+
"split_features_targets",
|
|
21
21
|
"split_continuous_binary",
|
|
22
22
|
"plot_correlation_heatmap",
|
|
23
23
|
"plot_value_distributions",
|
|
@@ -125,7 +125,9 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
|
|
|
125
125
|
|
|
126
126
|
# Stage 1: Drop rows with all target columns missing
|
|
127
127
|
if targets is not None:
|
|
128
|
-
|
|
128
|
+
# validate targets
|
|
129
|
+
valid_targets = [target for target in targets if target in df_clean.columns]
|
|
130
|
+
target_na = df_clean[valid_targets].isnull().all(axis=1)
|
|
129
131
|
if target_na.any():
|
|
130
132
|
print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
|
|
131
133
|
df_clean = df_clean[~target_na]
|
|
@@ -150,30 +152,6 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
|
|
|
150
152
|
return df_clean
|
|
151
153
|
|
|
152
154
|
|
|
153
|
-
def split_features_targets(df: pd.DataFrame, targets: list[str]):
|
|
154
|
-
"""
|
|
155
|
-
Splits a DataFrame's columns into features and targets.
|
|
156
|
-
|
|
157
|
-
Args:
|
|
158
|
-
df (pd.DataFrame): Pandas DataFrame containing the dataset.
|
|
159
|
-
targets (list[str]): List of column names to be treated as target variables.
|
|
160
|
-
|
|
161
|
-
Returns:
|
|
162
|
-
tuple: A tuple containing:
|
|
163
|
-
- pd.DataFrame: Features dataframe.
|
|
164
|
-
- pd.DataFrame: Targets dataframe.
|
|
165
|
-
|
|
166
|
-
Prints:
|
|
167
|
-
- Shape of the original dataframe.
|
|
168
|
-
- Shape of the features dataframe.
|
|
169
|
-
- Shape of the targets dataframe.
|
|
170
|
-
"""
|
|
171
|
-
df_targets = df[targets]
|
|
172
|
-
df_features = df.drop(columns=targets)
|
|
173
|
-
print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
|
|
174
|
-
return df_features, df_targets
|
|
175
|
-
|
|
176
|
-
|
|
177
155
|
def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
|
|
178
156
|
"""
|
|
179
157
|
Displays a table of columns with missing values, showing both the count and
|
|
@@ -202,7 +180,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
|
|
|
202
180
|
return null_summary
|
|
203
181
|
|
|
204
182
|
|
|
205
|
-
def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
|
|
183
|
+
def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
|
|
206
184
|
"""
|
|
207
185
|
Drops columns with more than `threshold` fraction of missing values.
|
|
208
186
|
|
|
@@ -210,11 +188,22 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
|
|
|
210
188
|
df (pd.DataFrame): The input DataFrame.
|
|
211
189
|
threshold (float): Fraction of missing values above which columns are dropped.
|
|
212
190
|
show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
|
|
191
|
+
skip_columns (list[str] | None): If given, these columns won't be included in the drop process.
|
|
213
192
|
|
|
214
193
|
Returns:
|
|
215
194
|
pd.DataFrame: A new DataFrame without the dropped columns.
|
|
216
195
|
"""
|
|
217
|
-
|
|
196
|
+
# If skip_columns is provided, create a list of columns to check.
|
|
197
|
+
# Otherwise, check all columns.
|
|
198
|
+
cols_to_check = df.columns
|
|
199
|
+
if skip_columns:
|
|
200
|
+
# Use set difference for efficient exclusion
|
|
201
|
+
cols_to_check = df.columns.difference(skip_columns)
|
|
202
|
+
|
|
203
|
+
# Calculate the missing fraction only on the columns to be checked
|
|
204
|
+
missing_fraction = df[cols_to_check].isnull().mean()
|
|
205
|
+
|
|
206
|
+
|
|
218
207
|
cols_to_drop = missing_fraction[missing_fraction > threshold].index
|
|
219
208
|
|
|
220
209
|
if len(cols_to_drop) > 0:
|
|
@@ -231,6 +220,30 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
|
|
|
231
220
|
return df
|
|
232
221
|
|
|
233
222
|
|
|
223
|
+
def split_features_targets(df: pd.DataFrame, targets: list[str]):
|
|
224
|
+
"""
|
|
225
|
+
Splits a DataFrame's columns into features and targets.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
df (pd.DataFrame): Pandas DataFrame containing the dataset.
|
|
229
|
+
targets (list[str]): List of column names to be treated as target variables.
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
tuple: A tuple containing:
|
|
233
|
+
- pd.DataFrame: Features dataframe.
|
|
234
|
+
- pd.DataFrame: Targets dataframe.
|
|
235
|
+
|
|
236
|
+
Prints:
|
|
237
|
+
- Shape of the original dataframe.
|
|
238
|
+
- Shape of the features dataframe.
|
|
239
|
+
- Shape of the targets dataframe.
|
|
240
|
+
"""
|
|
241
|
+
df_targets = df[targets]
|
|
242
|
+
df_features = df.drop(columns=targets)
|
|
243
|
+
print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
|
|
244
|
+
return df_features, df_targets
|
|
245
|
+
|
|
246
|
+
|
|
234
247
|
def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
235
248
|
"""
|
|
236
249
|
Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|