dragon-ml-toolbox 10.11.1__py3-none-any.whl → 10.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/RECORD +9 -9
- ml_tools/ML_models.py +19 -11
- ml_tools/data_exploration.py +223 -1
- ml_tools/path_manager.py +18 -11
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
dragon_ml_toolbox-10.
|
|
2
|
-
dragon_ml_toolbox-10.
|
|
1
|
+
dragon_ml_toolbox-10.12.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-10.12.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
3
|
ml_tools/ETL_cleaning.py,sha256=lSP5q6-ukGhJBPV8dlsqJvPXAzj4du_0J-SbtEd0Pjg,19292
|
|
4
4
|
ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
|
|
5
5
|
ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
|
|
@@ -9,7 +9,7 @@ ml_tools/ML_datasetmaster.py,sha256=vqKZhCXsvN5yeRJdOKqMPh5OhY1xe6xlNjM3WoH5lys,
|
|
|
9
9
|
ml_tools/ML_evaluation.py,sha256=6FB6S-aDDpFzQdrp3flBVECzEsHhMbQknYVGhHooEFs,16207
|
|
10
10
|
ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
|
|
11
11
|
ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
|
|
12
|
-
ml_tools/ML_models.py,sha256=
|
|
12
|
+
ml_tools/ML_models.py,sha256=JMFOuw4jtX5RtUFpkQWS8-dzDW0AwqYjbl67XRCVubA,27996
|
|
13
13
|
ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
|
|
14
14
|
ml_tools/ML_scaler.py,sha256=h2ymq5u953Lx60Qb38Y0mAWj85x9PbnP0xYNQ3pd8-w,7535
|
|
15
15
|
ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
|
|
@@ -21,16 +21,16 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
|
|
|
21
21
|
ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
|
|
22
22
|
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
23
23
|
ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
|
|
24
|
-
ml_tools/data_exploration.py,sha256
|
|
24
|
+
ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
|
|
25
25
|
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
26
26
|
ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
|
|
27
27
|
ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
|
|
28
28
|
ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
|
|
29
29
|
ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
|
|
30
30
|
ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
|
|
31
|
-
ml_tools/path_manager.py,sha256=
|
|
31
|
+
ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
|
|
32
32
|
ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
|
|
33
|
-
dragon_ml_toolbox-10.
|
|
34
|
-
dragon_ml_toolbox-10.
|
|
35
|
-
dragon_ml_toolbox-10.
|
|
36
|
-
dragon_ml_toolbox-10.
|
|
33
|
+
dragon_ml_toolbox-10.12.0.dist-info/METADATA,sha256=dgxB7Ad4a5Zf1CPzLZFo5ny2Siotmsm2mWjQ8B7Nsa4,6969
|
|
34
|
+
dragon_ml_toolbox-10.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
dragon_ml_toolbox-10.12.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
36
|
+
dragon_ml_toolbox-10.12.0.dist-info/RECORD,,
|
ml_tools/ML_models.py
CHANGED
|
@@ -300,8 +300,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
|
|
|
300
300
|
sequence with a standard Transformer Encoder.
|
|
301
301
|
"""
|
|
302
302
|
def __init__(self, *,
|
|
303
|
+
in_features: int,
|
|
303
304
|
out_targets: int,
|
|
304
|
-
numerical_indices: List[int],
|
|
305
305
|
categorical_map: Dict[int, int],
|
|
306
306
|
embedding_dim: int = 32,
|
|
307
307
|
num_heads: int = 8,
|
|
@@ -309,8 +309,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
|
|
|
309
309
|
dropout: float = 0.1):
|
|
310
310
|
"""
|
|
311
311
|
Args:
|
|
312
|
+
in_features (int): The total number of columns in the input data (features).
|
|
312
313
|
out_targets (int): Number of output targets (1 for regression).
|
|
313
|
-
numerical_indices (List[int]): Column indices for numerical features.
|
|
314
314
|
categorical_map (Dict[int, int]): Maps categorical column index to its cardinality (number of unique categories).
|
|
315
315
|
embedding_dim (int): The dimension for all feature embeddings. Must be divisible by num_heads.
|
|
316
316
|
num_heads (int): The number of heads in the multi-head attention mechanism.
|
|
@@ -330,15 +330,25 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
|
|
|
330
330
|
their cardinality (the number of unique categories) via the `categorical_map` parameter.
|
|
331
331
|
|
|
332
332
|
**Ordinal & Binary Features** (e.g., 'Low/Medium/High', 'True/False'): Should be treated as **numerical**. Map them to numbers that
|
|
333
|
-
represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should be included in the
|
|
334
|
-
`
|
|
333
|
+
represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should **NOT** be included in the
|
|
334
|
+
`categorical_map` parameter.
|
|
335
335
|
|
|
336
|
-
**Standard Numerical Features** (e.g., 'Age', 'Price'):
|
|
337
|
-
scale them before training.
|
|
336
|
+
**Standard Numerical and Continuous Features** (e.g., 'Age', 'Price'): It is highly recommended to scale them before training.
|
|
338
337
|
"""
|
|
339
338
|
super().__init__()
|
|
340
|
-
|
|
339
|
+
|
|
340
|
+
# --- Validation ---
|
|
341
|
+
if categorical_map and max(categorical_map.keys()) >= in_features:
|
|
342
|
+
_LOGGER.error(f"A categorical index ({max(categorical_map.keys())}) is out of bounds for the provided input features ({in_features}).")
|
|
343
|
+
raise ValueError()
|
|
344
|
+
|
|
345
|
+
# --- Derive numerical indices ---
|
|
346
|
+
all_indices = set(range(in_features))
|
|
347
|
+
categorical_indices_set = set(categorical_map.keys())
|
|
348
|
+
numerical_indices = sorted(list(all_indices - categorical_indices_set))
|
|
349
|
+
|
|
341
350
|
# --- Save configuration ---
|
|
351
|
+
self.in_features = in_features
|
|
342
352
|
self.out_targets = out_targets
|
|
343
353
|
self.numerical_indices = numerical_indices
|
|
344
354
|
self.categorical_map = categorical_map
|
|
@@ -405,8 +415,8 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
|
|
|
405
415
|
def get_architecture_config(self) -> Dict[str, Any]:
|
|
406
416
|
"""Returns the full configuration of the model."""
|
|
407
417
|
return {
|
|
418
|
+
'in_features': self.in_features,
|
|
408
419
|
'out_targets': self.out_targets,
|
|
409
|
-
'numerical_indices': self.numerical_indices,
|
|
410
420
|
'categorical_map': self.categorical_map,
|
|
411
421
|
'embedding_dim': self.embedding_dim,
|
|
412
422
|
'num_heads': self.num_heads,
|
|
@@ -416,11 +426,9 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
|
|
|
416
426
|
|
|
417
427
|
def __repr__(self) -> str:
|
|
418
428
|
"""Returns the developer-friendly string representation of the model."""
|
|
419
|
-
num_features = len(self.numerical_indices) + len(self.categorical_map)
|
|
420
|
-
|
|
421
429
|
# Build the architecture string part-by-part
|
|
422
430
|
parts = [
|
|
423
|
-
f"Tokenizer(features={
|
|
431
|
+
f"Tokenizer(features={self.in_features}, dim={self.embedding_dim})",
|
|
424
432
|
"[CLS]",
|
|
425
433
|
f"TransformerEncoder(layers={self.num_layers}, heads={self.num_heads})",
|
|
426
434
|
f"PredictionHead(outputs={self.out_targets})"
|
ml_tools/data_exploration.py
CHANGED
|
@@ -22,6 +22,7 @@ __all__ = [
|
|
|
22
22
|
"drop_columns_with_missing_data",
|
|
23
23
|
"drop_macro",
|
|
24
24
|
"clean_column_names",
|
|
25
|
+
"encode_categorical_features",
|
|
25
26
|
"split_features_targets",
|
|
26
27
|
"split_continuous_binary",
|
|
27
28
|
"plot_correlation_heatmap",
|
|
@@ -29,7 +30,9 @@ __all__ = [
|
|
|
29
30
|
"clip_outliers_single",
|
|
30
31
|
"clip_outliers_multi",
|
|
31
32
|
"match_and_filter_columns_by_regex",
|
|
32
|
-
"standardize_percentages"
|
|
33
|
+
"standardize_percentages",
|
|
34
|
+
"create_transformer_categorical_map",
|
|
35
|
+
"reconstruct_one_hot"
|
|
33
36
|
]
|
|
34
37
|
|
|
35
38
|
|
|
@@ -337,6 +340,90 @@ def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacemen
|
|
|
337
340
|
return new_df
|
|
338
341
|
|
|
339
342
|
|
|
343
|
+
def encode_categorical_features(
    df: pd.DataFrame,
    columns_to_encode: List[str],
    encode_nulls: bool,
    split_resulting_dataset: bool = True,
    verbose: bool = True
) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Label-encodes the specified categorical columns and returns the mappings used.

    For every requested column, the unique non-null values are converted to strings,
    sorted, and assigned consecutive integer codes. The input DataFrame is never
    modified; all work happens on a copy.

    Args:
        df (pd.DataFrame): The input DataFrame.
        columns_to_encode (List[str]): Column names to encode. Names that are not
            present in `df` are skipped with a warning; repeated names are encoded once.
        encode_nulls (bool): If True, a column that contains Null values gets the extra
            category "Other" with code 0, and its real categories start from 1.
            Columns without Nulls are encoded from 0 and get no "Other" entry.
            If False, Nulls are left as missing values in the encoded column.
        split_resulting_dataset (bool): If True, returns two separate DataFrames:
            one with the non-categorical columns and one with the encoded columns.
            If False, returns a single DataFrame with all columns.
        verbose (bool): If True, prints encoding progress.

    Returns:
        Tuple:

        - Dict[str, Dict[str, int]]: For each encoded column, its category-to-integer mapping.

        - pd.DataFrame: If `split_resulting_dataset` is True, the original columns that were
          not encoded; otherwise the full DataFrame with the encoded columns in place.

        - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
    """
    df_encoded = df.copy()

    # Validate columns. dict.fromkeys drops repeated names while keeping order, so a
    # column is never encoded a second time from its already-encoded integer values.
    valid_columns = list(dict.fromkeys(col for col in columns_to_encode if col in df_encoded.columns))
    missing_columns = set(columns_to_encode) - set(valid_columns)
    if missing_columns:
        _LOGGER.warning(f"Columns not found and will be skipped: {list(missing_columns)}")

    mappings: Dict[str, Dict[str, int]] = {}

    _LOGGER.info(f"Encoding {len(valid_columns)} categorical column(s).")
    for col_name in valid_columns:
        column = df_encoded[col_name]
        null_mask = column.isnull()
        has_nulls = bool(null_mask.any())

        # Code 0 is reserved for Nulls only when requested AND the column has Nulls.
        reserve_null_code = encode_nulls and has_nulls
        first_code = 1 if reserve_null_code else 0

        categories = sorted(str(cat) for cat in column.dropna().unique())
        mapping = {category: code for code, category in enumerate(categories, start=first_code)}

        encoded = column.astype(str).map(mapping)
        if has_nulls:
            # astype(str) turns Nulls into text such as 'None' or 'nan'; blank them out
            # explicitly so a Null can never pick up the code of a category with that label.
            encoded = encoded.mask(null_mask)

        if reserve_null_code:
            if "Other" in mapping:
                _LOGGER.warning(f"Column '{col_name}' already has a category named 'Other'; its entry in the returned mapping is replaced by the Null code 0.")
            df_encoded[col_name] = encoded.fillna(0).astype(int)
            # Create the complete user-facing map including "Other"
            mappings[col_name] = {**mapping, "Other": 0}
        else:
            # Nulls (if any) stay missing in the encoded column.
            df_encoded[col_name] = encoded
            mappings[col_name] = mapping

        if verbose:
            cardinality = len(mappings[col_name])
            print(f" - Encoded '{col_name}' with {cardinality} unique values.")

    # Handle the dataset splitting logic
    if split_resulting_dataset:
        # Selecting with a list already yields a DataFrame (DataFrame has no .to_frame()).
        df_categorical = df_encoded[valid_columns].copy()
        df_non_categorical = df.drop(columns=valid_columns)
        return mappings, df_non_categorical, df_categorical

    return mappings, df_encoded, None
|
|
425
|
+
|
|
426
|
+
|
|
340
427
|
def split_features_targets(df: pd.DataFrame, targets: list[str]):
|
|
341
428
|
"""
|
|
342
429
|
Splits a DataFrame's columns into features and targets.
|
|
@@ -766,6 +853,141 @@ def standardize_percentages(
|
|
|
766
853
|
return df_copy
|
|
767
854
|
|
|
768
855
|
|
|
856
|
+
def create_transformer_categorical_map(
    df: pd.DataFrame,
    mappings: Dict[str, Dict[str, int]],
    verbose: bool = True
) -> Dict[int, int]:
    """
    Creates the `categorical_map` required by a `TabularTransformer` model.

    This function should be called late in the preprocessing pipeline, after all
    column additions, deletions, or reordering have occurred. It uses the final
    DataFrame's column order to map the correct column index to its cardinality.

    Args:
        df (pd.DataFrame): The final, processed DataFrame.
        mappings (Dict[str, Dict[str, int]]): The mappings dictionary generated by
            `encode_categorical_features`, containing the category-to-integer
            mapping for each categorical column.
        verbose (bool): If True, prints mapping progress.

    Returns:
        (Dict[int, int]): The final `categorical_map` for the transformer,
        mapping each column's current index to its cardinality (e.g., {0: 3}).
        Columns listed in `mappings` but absent from `df` are skipped with a warning.

    Raises:
        ValueError: If a categorical column name occurs more than once in `df`,
            because it then has no single column index.
    """
    # Record every position of every column label. A plain `Index.get_loc` returns a
    # slice or boolean mask for duplicated labels, which is not a usable integer key.
    positions_by_name: Dict[str, List[int]] = {}
    for position, label in enumerate(df.columns):
        positions_by_name.setdefault(label, []).append(position)

    transformer_map: Dict[int, int] = {}

    _LOGGER.info("Creating categorical map for TabularTransformer.")
    for col_name, col_mapping in mappings.items():
        positions = positions_by_name.get(col_name)

        if not positions:
            _LOGGER.warning(f"Categorical column '{col_name}' not found in the final DataFrame. Skipping.")
            continue

        if len(positions) > 1:
            _LOGGER.error(f"Categorical column '{col_name}' appears {len(positions)} times in the DataFrame (indices {positions}); column names must be unique.")
            raise ValueError()

        col_idx = positions[0]

        # Get cardinality directly from the length of the mapping dictionary
        cardinality = len(col_mapping)

        transformer_map[col_idx] = cardinality
        if verbose:
            print(f" - Mapping column '{col_name}' at index {col_idx} with cardinality {cardinality}.")

    return transformer_map
|
|
897
|
+
|
|
898
|
+
|
|
899
|
+
def reconstruct_one_hot(
    df: pd.DataFrame,
    base_feature_names: List[str],
    separator: str = '_',
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs original categorical columns from a one-hot encoded DataFrame.

    This function identifies groups of one-hot encoded columns based on a common
    prefix (base feature name) and a separator. It then collapses each group
    into a single column containing the categorical value.

    Args:
        df (pd.DataFrame):
            The input DataFrame with one-hot encoded columns. It is not modified.
        base_feature_names (List[str]):
            A list of base feature names to reconstruct. For example, if you have
            columns 'B_a', 'B_b', 'B_c', you would pass `['B']`.
        separator (str):
            The character separating the base name from the categorical value in
            the column names (e.g., '_' in 'B_a').
        drop_original (bool):
            If True, the original one-hot encoded columns will be dropped from
            the returned DataFrame.
        verbose (bool):
            If True, prints one progress line per reconstructed feature.

    Returns:
        pd.DataFrame:
            A new DataFrame with the specified one-hot encoded features
            reconstructed into single categorical columns. Rows whose group is
            all zeros get a missing value.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.

    <br>

    ## Note:

    This function is designed to be robust, but users should be aware of two key edge cases:

    1. **Ambiguous Base Feature Prefixes**: If `base_feature_names` list contains names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. Each column is assigned to the first base name that matches it and is not considered again for later names. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).

    2. **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    new_df = df.copy()
    claimed_cols = set()  # columns already assigned to an earlier base name
    all_ohe_cols_to_drop = []
    reconstructed_count = 0

    _LOGGER.info(f"Attempting to reconstruct {len(base_feature_names)} one-hot encoded feature(s).")

    for base_name in base_feature_names:
        prefix = f"{base_name}{separator}"

        # Match against the ORIGINAL columns (not the ones added below), skipping
        # non-string labels and anything a more specific base name already took.
        ohe_cols = [
            col for col in df.columns
            if isinstance(col, str) and col.startswith(prefix) and col not in claimed_cols
        ]

        if not ohe_cols:
            _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
            continue

        claimed_cols.update(ohe_cols)
        ohe_block = df[ohe_cols]

        # For each row, find the column name with the maximum value (which is 1)
        winning_column = ohe_block.idxmax(axis=1)

        # The category is everything after the full prefix. Stripping by prefix length
        # (instead of splitting on the separator) stays correct when the base name or
        # the category itself contains the separator.
        category_by_column = {col: col[len(prefix):] for col in ohe_cols}
        new_column_values = winning_column.map(category_by_column)

        # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
        # In these cases, idxmax returns the first column name, but the sum of values is 0.
        all_zero_mask = ohe_block.sum(axis=1) == 0
        new_column_values = new_column_values.mask(all_zero_mask)

        # Assign the new reconstructed column to the DataFrame
        new_df[base_name] = new_column_values

        all_ohe_cols_to_drop.extend(ohe_cols)
        reconstructed_count += 1
        if verbose:
            print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")

    if drop_original and all_ohe_cols_to_drop:
        # Drop the original OHE columns, ensuring no duplicates in the drop list
        unique_cols_to_drop = list(dict.fromkeys(all_ohe_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df
|
|
989
|
+
|
|
990
|
+
|
|
769
991
|
def _validate_columns(df: pd.DataFrame, columns: list[str]):
|
|
770
992
|
valid_columns = [column for column in columns if column in df.columns]
|
|
771
993
|
return valid_columns
|
ml_tools/path_manager.py
CHANGED
|
@@ -248,26 +248,33 @@ class PathManager:
|
|
|
248
248
|
_LOGGER.error(f"'{type(self).__name__}' object has no attribute or path key '{sanitized_name}'")
|
|
249
249
|
raise AttributeError()
|
|
250
250
|
|
|
251
|
-
def __setattr__(self, name: str, value: Union[str, Path]):
|
|
251
|
+
def __setattr__(self, name: str, value: Union[str, Path, bool, dict, int, tuple]):
    """
    Allows attribute-style setting of paths, e.g., PM.data = 'path/to/data'.

    Names starting with an underscore are treated as internal attributes and are
    set directly on the object, but only while the `_initialized` flag is not yet
    truthy. Every other name is treated as a public path key and stored in
    `self._paths` as a `Path`.

    Args:
        name (str): The attribute name being assigned.
        value: Any object for an internal (underscore-prefixed) attribute;
            a `str` or `Path` for a public path.

    Raises:
        AttributeError: If an underscore-prefixed attribute is assigned after
            initialization, or if the sanitized name already exists on the class.
        TypeError: If a public path is assigned a value that is not `str` or `Path`.
    """
    # Check for internal attributes, which are set directly on the object.
    if name.startswith('_'):
        # Read the flag through __dict__ so a missing flag defaults to False
        # without going through normal attribute lookup.
        is_initialized = self.__dict__.get('_initialized', False)
        if is_initialized:
            _LOGGER.error(f"Cannot set private attribute '{name}' after initialization.")
            raise AttributeError()
        super().__setattr__(name, value)
        return

    # Sanitize the key for the public path.
    # NOTE(review): both helpers are defined outside this view; presumably
    # _check_underscore_key rejects keys that start with an underscore. Confirm.
    sanitized_name = self._sanitize_key(name)
    self._check_underscore_key(sanitized_name)

    # Prevent overwriting existing methods (e.g., PM.status = 'foo').
    # This check looks at the class, not the instance therefore won't trigger __getattr__.
    if hasattr(self.__class__, sanitized_name):
        _LOGGER.error(f"Cannot overwrite existing attribute or method '{sanitized_name}' ({name}).")
        raise AttributeError()

    if not isinstance(value, (str, Path)):
        _LOGGER.error(f"Cannot assign type '{type(value).__name__}' to a path. Must be str or Path.")
        raise TypeError()

    # If all checks pass, treat it as a public path and store it in the _paths dictionary.
    self._paths[sanitized_name] = Path(value)
|
|
272
279
|
|
|
273
280
|
|
|
File without changes
|
{dragon_ml_toolbox-10.11.1.dist-info → dragon_ml_toolbox-10.12.0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|