upgini 1.2.81a3832.dev8__py3-none-any.whl → 1.2.81a3832.dev10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +5 -5
- upgini/metrics.py +172 -113
- {upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/METADATA +1 -1
- {upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/RECORD +7 -7
- {upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/WHEEL +0 -0
- {upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.81a3832.dev8"
+__version__ = "1.2.81a3832.dev10"
upgini/features_enricher.py
CHANGED
@@ -1768,10 +1768,10 @@ class FeaturesEnricher(TransformerMixin):
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)

-
-
-
-        columns_renaming = {c: c for c in df.columns}
+        normalizer = Normalizer(self.bundle, self.logger)
+        df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
+        columns_renaming = normalizer.columns_renaming
+        # columns_renaming = {c: c for c in df.columns}

         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)

@@ -3881,7 +3881,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))

-        return [f.name for f in features_meta if f.type == "categorical"
+        return [f.name for f in features_meta if f.type == "categorical"]

     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
upgini/metrics.py
CHANGED
@@ -6,7 +6,7 @@ import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 import lightgbm as lgb
 import numpy as np
@@ -32,7 +32,10 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import
+from sklearn.model_selection import (  # , TimeSeriesSplit
+    BaseCrossValidator,
+    TimeSeriesSplit,
+)

 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -250,8 +253,49 @@ class _CrossValResults:
         return f"{self.metric:.3f} ± {self.metric_std:.3f}"


+def is_numeric_object(x: pd.Series) -> bool:
+    try:
+        pd.to_numeric(x, errors="raise")
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_valid_numeric_array_data(data: pd.Series) -> bool:
+    data_without_na = data.dropna()
+    if data_without_na.empty:
+        return False
+
+    first_element = data_without_na.iloc[0]
+
+    # numpy.ndarray with numeric types
+    if isinstance(first_element, np.ndarray):
+        return np.issubdtype(first_element.dtype, np.number)
+
+    # DataFrame with all numeric columns
+    elif isinstance(first_element, pd.DataFrame):
+        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
+
+    # list or list of lists with numeric types
+    elif isinstance(first_element, list):
+        try:
+            # flat list
+            if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
+                return True
+            # list of lists
+            elif all(
+                isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
+                for x in first_element
+            ):
+                return True
+        except Exception:
+            return False
+
+    return False
+
+
 class EstimatorWrapper:
-    default_estimator = "catboost"
+    default_estimator: Literal["catboost", "lightgbm"] = "catboost"

     def __init__(
         self,
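A quick sanity check of how the two new helpers classify inputs (a standalone sketch; the import assumes the helpers stay module-level in `upgini.metrics`, as this hunk shows):

```python
import numpy as np
import pandas as pd

from upgini.metrics import is_numeric_object, is_valid_numeric_array_data

# is_numeric_object: True whenever pd.to_numeric can parse every value.
print(is_numeric_object(pd.Series(["1", "2.5", None])))  # True (None parses to NaN)
print(is_numeric_object(pd.Series(["a", "b"])))          # False (ValueError is caught)

# is_valid_numeric_array_data inspects only the first non-NA element.
print(is_valid_numeric_array_data(pd.Series([np.array([1.0, 2.0])])))  # True: numeric ndarray
print(is_valid_numeric_array_data(pd.Series([[1, 2], [3, 4]])))        # True: flat numeric lists
print(is_valid_numeric_array_data(pd.Series(["abc"])))                 # False: plain strings
```

Because only the first non-NA element is checked, a column mixing arrays and scalars can still pass; that reads as a deliberate trade-off for speed.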
@@ -279,6 +323,7 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
+        self.droped_features = []

     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -286,26 +331,51 @@ class EstimatorWrapper:
         self.estimator.fit(x, y, **kwargs)
         return self

-    def predict(self, **kwargs):
-
+    def predict(self, x: pd.DataFrame, **kwargs):
+        x, _, _ = self._prepare_to_calculate(x, None)
+        return self.estimator.predict(x, **kwargs)

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups = self._prepare_data(x, y, groups=self.groups)
-        return x, y, groups, {}

-    def _prepare_data(
-        self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
-    ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
         self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
         for c in x.columns:
-            if c
-
-
-
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                self.droped_features.append(c)
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+            elif c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                elif is_numeric_object(x[c]):
+                    self.logger.warning(
+                        f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
+                    )
+                    x[c] = pd.to_numeric(x[c], errors="coerce")
+                    self.cat_features.remove(c)
+                elif x[c].dtype != "category":
                     x[c] = x[c].astype(str)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
             else:
-                if x[c].dtype == "category" and x[c].cat.categories.dtype ==
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
                     x[c] = x[c].astype(np.int64)
+                elif not is_valid_numeric_array_data(x[c]):
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        self.droped_features.append(c)
+                        x.drop(columns=[c], inplace=True)
+
+        return x, y, groups, {}
+
+    def _prepare_data(
+        self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
+    ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:

         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
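Taken together, the rewritten loop screens every column before fitting. A toy illustration of the branch outcomes (illustrative column names; this mirrors the logic above rather than calling the wrapper):

```python
import numpy as np
import pandas as pd

x = pd.DataFrame({
    "const": [1, 1, 1, 1],               # fewer than 2 unique values
    "num_as_str": ["1", "2", "3", "4"],  # declared categorical but numeric-like
    "flag": [True, False, True, False],  # boolean categorical
    "city": ["a", "b", "a", "c"],        # genuinely categorical
})
cat_features = ["num_as_str", "flag", "city"]

# After the new _prepare_to_fit:
#   "const"      -> dropped and recorded in self.droped_features
#   "num_as_str" -> pd.to_numeric(..., errors="coerce"), removed from cat_features
#   "flag"       -> cast to np.int64 (bool branch)
#   "city"       -> cast to str for the estimator
```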
@@ -335,6 +405,25 @@ class EstimatorWrapper:

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.warning(f"Dropped features: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        for c in x.columns:
+            if c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].astype(str)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+            else:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                elif not is_valid_numeric_array_data(x[c]):
+                    x[c] = pd.to_numeric(x[c], errors="coerce")
+
         return x, y, {}

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
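The mirrored casts in `_prepare_to_calculate` exist for train/score consistency: whatever was dropped while fitting is dropped again before metrics or predictions are computed. A minimal standalone sketch of that contract (hypothetical class, not the upgini API):

```python
import pandas as pd

class PrepState:
    # Records fit-time drops and replays them at scoring time.
    def __init__(self):
        self.droped_features = []  # same (misspelled) attribute name as the wrapper uses

    def prepare_to_fit(self, x: pd.DataFrame) -> pd.DataFrame:
        self.droped_features = [c for c in x.columns if x[c].nunique(dropna=False) < 2]
        return x.drop(columns=self.droped_features)

    def prepare_to_calculate(self, x: pd.DataFrame) -> pd.DataFrame:
        # the same columns dropped at fit time are dropped again here
        return x.drop(columns=self.droped_features)

state = PrepState()
train = pd.DataFrame({"const": [1, 1], "f": [1.0, 2.0]})
test = pd.DataFrame({"const": [1, 1], "f": [3.0, 4.0]})
assert list(state.prepare_to_fit(train)) == list(state.prepare_to_calculate(test)) == ["f"]
```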
@@ -550,8 +639,6 @@ class CatBoostWrapper(EstimatorWrapper):
         )
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.drop_cat_features = []
-        self.features_to_encode = []

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -560,55 +647,60 @@ class CatBoostWrapper(EstimatorWrapper):
         import catboost
         from catboost import CatBoostClassifier

-        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
+        else:
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
-
-
-                "Embedding features count more than 3, so group them into one vector for CatBoost: "
-                f"{self.emb_features}"
-            )
-            x, self.grouped_embedding_features = self.group_embeddings(x)
+            x, self.grouped_embedding_features = self.group_embeddings(x)
+            if len(self.grouped_embedding_features) > 0:
                 params["embedding_features"] = self.grouped_embedding_features
-            else:
-                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
-                self.grouped_embedding_features = None
-        else:
-            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")

         # Find text features from passed in generate_features
-        if hasattr(CatBoostClassifier, "get_text_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
+            self.text_features = None
+            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
+        else:
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
                 self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
-        else:
-            self.text_features = None
-            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")

         # Find rest categorical features
-        self.cat_features
-
-
-
-
+        self.cat_features = [
+            f
+            for f in self.cat_features
+            if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
+        ]
+        if self.cat_features:
+            for c in self.cat_features:
                 if is_numeric_dtype(x[c]):
                     x[c] = x[c].fillna(np.nan)
-
+                elif x[c].dtype != "category":
                     x[c] = x[c].fillna("NA")
-            params["cat_features"] = self.
+            params["cat_features"] = self.cat_features

         return x, y, groups, params

     def group_embeddings(self, df: pd.DataFrame):
-
-
-
-
-
-
-
+        embeddings_columns = []
+        if len(self.emb_features) > 3:
+            self.logger.info(
+                "Embedding features count more than 3, so group them into one vector for CatBoost: "
+                f"{self.emb_features}"
+            )
+            emb_name = "__grouped_embeddings"
+            df = df.copy()
+            df[self.emb_features] = df[self.emb_features].fillna(0.0)
+            embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
+            df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
+            embeddings_columns.append(emb_name)
+        for c in df.columns:
+            if is_valid_numeric_array_data(df[c]):
+                embeddings_columns.append(c)
+
+        return df, embeddings_columns

     def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
         if "__grouped_embeddings" in shap_values:
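The rewritten `group_embeddings` packs the separate `*_emb{N}` columns into one list-valued column that CatBoost can consume through `embedding_features`. The core transformation, lifted from the hunk above into a runnable snippet:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "f_emb0": [0.1, 0.2], "f_emb1": [0.3, np.nan],
    "f_emb2": [0.5, 0.6], "f_emb3": [0.7, 0.8],
    "other": [1, 2],
})
emb_features = ["f_emb0", "f_emb1", "f_emb2", "f_emb3"]  # len > 3 triggers grouping

df[emb_features] = df[emb_features].fillna(0.0)  # NaNs become 0.0 inside the vectors
grouped = pd.Series(df[emb_features].values.tolist(), index=df.index)
df = pd.concat(
    [df.drop(columns=emb_features), pd.DataFrame({"__grouped_embeddings": grouped})], axis=1
)
print(df["__grouped_embeddings"].iloc[1])  # [0.2, 0.0, 0.6, 0.8]
```

The method then also scans the remaining columns with `is_valid_numeric_array_data`, so pre-existing array-valued columns are reported as embedding columns as well.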
@@ -618,8 +710,6 @@ class CatBoostWrapper(EstimatorWrapper):
         return shap_values

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
@@ -627,13 +717,13 @@ class CatBoostWrapper(EstimatorWrapper):
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns

-        if self.
-            for c in self.
+        if self.cat_features:
+            for c in self.cat_features:
                 if is_numeric_dtype(x[c]):
                     x[c] = x[c].fillna(np.nan)
-
+                elif x[c].dtype != "category":
                     x[c] = x[c].fillna("NA")
-            params["cat_features"] = self.
+            params["cat_features"] = self.cat_features

         return x, y, params

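The new `elif x[c].dtype != "category"` guards matter because filling a pandas Categorical with a value outside its categories raises instead of filling. A small repro (the exact exception type varies across pandas versions):

```python
import pandas as pd

s = pd.Series(["a", "b", None], dtype="category")
try:
    s.fillna("NA")  # "NA" is not an existing category
except (TypeError, ValueError) as exc:
    print(type(exc).__name__)

# Hence: numeric columns get fillna(np.nan), plain object columns get
# fillna("NA"), and true "category" columns are left untouched.
```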
@@ -658,7 +748,7 @@ class CatBoostWrapper(EstimatorWrapper):
             )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.
+                self.droped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -731,8 +821,6 @@ class LightGBMWrapper(EstimatorWrapper):
         )
         self.cat_encoder = None
         self.n_classes = None
-        self.exclude_features = []
-        self.features_to_encode = []

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
@@ -742,25 +830,23 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
         params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features
-
-
-
-            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
-            encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
-            x[self.features_to_encode] = encoded
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
             self.cat_encoder = encoder
-
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.
-
-
-        )
+        if self.cat_features is not None and self.cat_encoder is not None:
+            encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
         return x, y_numpy, params

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
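For reference, the encode-then-reuse round trip the LightGBM wrapper now performs, assuming `CatBoostEncoder` comes from the `category_encoders` package (its import sits outside this hunk):

```python
import numpy as np
import pandas as pd
from category_encoders import CatBoostEncoder

x = pd.DataFrame({"color": ["red", "blue", "red", "green"]})
y = np.array([1, 0, 1, 0])

# Fit-time: target-encode the declared categorical columns.
encoder = CatBoostEncoder(random_state=42, cols=["color"], return_df=True)
x[["color"]] = encoder.fit_transform(x[["color"]].astype("object"), y).astype("category")

# Scoring-time: reuse the fitted encoder, much as _prepare_to_calculate
# does with self.cat_encoder (unseen values fall back to the target prior).
x_test = pd.DataFrame({"color": ["red", "yellow"]})
x_test[["color"]] = encoder.transform(x_test[["color"]].astype("object")).astype("category")
```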
@@ -822,9 +908,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):

     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
-        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
-            self.logger, x, self.cat_features
-        )
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
         if self.cat_features:
@@ -832,20 +915,23 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
             x[self.cat_features] = encoded
             self.cat_encoder = encoder
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-
-        x = x.drop(columns=self.exclude_features)
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-            if self.
-                x[self.
-                x[self.
+            if self.cat_features and self.cat_encoder is not None:
+                x[self.cat_features] = self.cat_encoder.transform(
+                    x[self.cat_features].astype("object"), y_numpy
                 ).astype("category")
-        return x,
+        return x, y_numpy, params


 def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -931,40 +1017,6 @@ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None
     return scoring, metric_name, multiplier


-def _get_cat_features(
-    logger: logging.Logger,
-    x: pd.DataFrame,
-    cat_features: Optional[List[str]],
-    text_features: Optional[List[str]] = None,
-    emb_features: Optional[List[str]] = None,
-) -> List[str]:
-    cat_features = cat_features or []
-    text_features = text_features or []
-    emb_features = emb_features or []
-    exclude_features = text_features + emb_features
-    cat_features = [c for c in cat_features if c not in exclude_features]
-    unique_cat_features = []
-    drop_cat_features = []
-    for name in cat_features:
-        # Remove constant categorical features
-        if x[name].nunique(dropna=False) > 1:
-            unique_cat_features.append(name)
-        else:
-            logger.warning(f"Drop column {name} on preparing data for fit")
-            x.drop(columns=name, inplace=True)
-            drop_cat_features.append(name)
-    cat_features = unique_cat_features
-
-    logger.info(f"Selected categorical features: {cat_features}")
-
-    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
-    features_to_encode = [f for f in cat_features if f in features_to_encode]
-
-    logger.info(f"Features to encode: {features_to_encode}")
-
-    return cat_features, features_to_encode, drop_cat_features
-
-
 def _get_add_params(input_params, add_params):
     output_params = dict(input_params)
     if add_params is not None:
@@ -1050,3 +1102,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
         multioutput=multioutput,
     )
     return mse if squared else np.sqrt(mse)
+
+
+def _get_unique_count(series: pd.Series) -> int:
+    try:
+        return series.nunique(dropna=False)
+    except TypeError:
+        return series.astype(str).nunique(dropna=False)
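`_get_unique_count` exists because `Series.nunique` hashes values, so unhashable cells, such as the list-valued embedding columns introduced above, raise `TypeError`. A quick demonstration:

```python
import pandas as pd

s = pd.Series([[1, 2], [1, 2], [3, 4]])  # list cells are unhashable
try:
    print(s.nunique(dropna=False))
except TypeError:
    print(s.astype(str).nunique(dropna=False))  # 2, counted via the string form
```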
{upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev8
+Version: 1.2.81a3832.dev10
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=dUnN248oLg0rBaOttshEyx0_AtLIiP6ku5lXmtwrlQo,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
 upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=DpXJtooXDCLTJUf3JlfIsJiwx9Hg-2vv4-k4RWkXFMU,42269
 upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev8.dist-info/METADATA,sha256=
-upgini-1.2.81a3832.dev8.dist-info/WHEEL,sha256=
-upgini-1.2.81a3832.dev8.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.81a3832.dev8.dist-info/RECORD,,
+upgini-1.2.81a3832.dev10.dist-info/METADATA,sha256=F0Eg-CF-u-X2QDwUGlH0Fom-Ys1Br4bfoR_RBUq0ob8,49173
+upgini-1.2.81a3832.dev10.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev10.dist-info/RECORD,,
{upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/WHEEL
File without changes
{upgini-1.2.81a3832.dev8.dist-info → upgini-1.2.81a3832.dev10.dist-info}/licenses/LICENSE
File without changes