upgini 1.2.81a3832.dev9__py3-none-any.whl → 1.2.81a3832.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/metrics.py +170 -113
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev10.dist-info}/METADATA +1 -1
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev10.dist-info}/RECORD +6 -6
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev10.dist-info}/WHEEL +0 -0
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev10.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.81a3832.
|
1
|
+
__version__ = "1.2.81a3832.dev10"
|
upgini/metrics.py
CHANGED
@@ -32,7 +32,10 @@ except ImportError:
|
|
32
32
|
available_scorers = SCORERS
|
33
33
|
from sklearn.metrics import mean_squared_error
|
34
34
|
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
35
|
-
from sklearn.model_selection import
|
35
|
+
from sklearn.model_selection import ( # , TimeSeriesSplit
|
36
|
+
BaseCrossValidator,
|
37
|
+
TimeSeriesSplit,
|
38
|
+
)
|
36
39
|
|
37
40
|
from upgini.errors import ValidationError
|
38
41
|
from upgini.metadata import ModelTaskType
|
@@ -250,6 +253,47 @@ class _CrossValResults:
|
|
250
253
|
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
251
254
|
|
252
255
|
|
256
|
+
def is_numeric_object(x: pd.Series) -> bool:
    """Tell whether every value of ``x`` can be parsed as a number.

    The series is handed to ``pd.to_numeric(..., errors="raise")``; the converted
    result is discarded, only success or failure of the conversion matters.

    Args:
        x: Series to probe.

    Returns:
        True when the conversion succeeds, False when pandas raises
        ``ValueError`` or ``TypeError``.
    """
    try:
        pd.to_numeric(x, errors="raise")
    except (ValueError, TypeError):
        return False
    else:
        return True
|
262
|
+
|
263
|
+
|
264
|
+
def _is_numeric_or_missing(value) -> bool:
    """Return True for a numeric scalar or a scalar missing-value marker."""
    if isinstance(value, (int, float, np.number)):
        return True
    # pd.isna() answers with an array for list-like input, and an array has no
    # single truth value, so only scalars are asked whether they are missing.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def is_valid_numeric_array_data(data: pd.Series) -> bool:
    """Tell whether ``data`` holds array-like cells made of numbers.

    Only the first non-missing cell is inspected. It qualifies when it is:

    * a ``numpy.ndarray`` with a numeric dtype,
    * a ``pandas.DataFrame`` whose columns all have numeric dtypes,
    * a flat list of numbers / missing values, or
    * a list of lists of numbers / missing values.

    Args:
        data: Series whose cells are to be classified.

    Returns:
        True for the cases above; False for every other cell type and for a
        series with no non-missing cells.
    """
    data_without_na = data.dropna()
    if data_without_na.empty:
        return False

    first_element = data_without_na.iloc[0]

    # numpy.ndarray with numeric types
    if isinstance(first_element, np.ndarray):
        return np.issubdtype(first_element.dtype, np.number)

    # DataFrame with all numeric columns
    if isinstance(first_element, pd.DataFrame):
        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)

    # list or list of lists with numeric types
    if isinstance(first_element, list):
        # flat list
        if all(_is_numeric_or_missing(item) for item in first_element):
            return True
        # list of lists: previously unreachable for inner lists longer than one
        # element, because the flat check raised on them and the error was swallowed
        return all(
            isinstance(row, list) and all(_is_numeric_or_missing(item) for item in row)
            for row in first_element
        )

    return False
|
295
|
+
|
296
|
+
|
253
297
|
class EstimatorWrapper:
|
254
298
|
default_estimator: Literal["catboost", "lightgbm"] = "catboost"
|
255
299
|
|
@@ -279,6 +323,7 @@ class EstimatorWrapper:
|
|
279
323
|
self.groups = groups
|
280
324
|
self.text_features = text_features
|
281
325
|
self.logger = logger or logging.getLogger()
|
326
|
+
self.droped_features = []
|
282
327
|
|
283
328
|
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
284
329
|
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
@@ -286,28 +331,51 @@ class EstimatorWrapper:
|
|
286
331
|
self.estimator.fit(x, y, **kwargs)
|
287
332
|
return self
|
288
333
|
|
289
|
-
def predict(self, **kwargs):
|
290
|
-
|
334
|
+
def predict(self, x: pd.DataFrame, **kwargs):
    """Run ``x`` through ``_prepare_to_calculate`` and return ``self.estimator.predict`` on the result.

    Extra keyword arguments are forwarded to ``self.estimator.predict`` unchanged.
    """
    # No target is available here, so None is passed in place of y.
    # NOTE(review): _prepare_to_calculate hands y to _prepare_data, and the visible
    # start of _prepare_data raises for any target that is not a pd.Series -
    # confirm this call can actually succeed with y=None.
    x, _, _ = self._prepare_to_calculate(x, None)
    return self.estimator.predict(x, **kwargs)
|
291
337
|
|
292
338
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
    """Prepare features and target for fitting and record which columns were dropped.

    After ``_prepare_data`` every column is handled in turn:

    * fewer than two unique values: dropped and recorded in ``self.droped_features``;
    * categorical feature: boolean data becomes int64, data that parses as numbers is
      converted and removed from ``self.cat_features``, anything else that is not
      already ``category`` is cast to ``str``;
    * text feature: cast to ``str``;
    * any other column: boolean data becomes int64; a column not accepted by
      ``is_valid_numeric_array_data`` is converted with ``pd.to_numeric`` and
      dropped (and recorded) when that conversion fails.

    ``self.droped_features`` is reset on every call, and ``self.cat_features`` is
    normalised to a list so that None is accepted.

    Returns:
        ``(x, y, groups, fit_params)`` where ``fit_params`` is an empty dict.
    """
    x, y, groups = self._prepare_data(x, y, groups=self.groups)

    self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
    self.droped_features = []
    # The membership tests below (and in subclasses) need a list; None used to be
    # normalised by the helper this loop replaced.
    if self.cat_features is None:
        self.cat_features = []
    for c in x.columns:
        if _get_unique_count(x[c]) < 2:
            self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
            self.droped_features.append(c)
            if c in self.cat_features:
                self.cat_features.remove(c)
            x.drop(columns=[c], inplace=True)
        elif c in self.cat_features:
            if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
                x[c] = x[c].astype(np.int64)
            elif is_numeric_object(x[c]):
                self.logger.warning(
                    f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
                )
                x[c] = pd.to_numeric(x[c], errors="coerce")
                self.cat_features.remove(c)
            elif x[c].dtype != "category":
                x[c] = x[c].astype(str)
        elif self.text_features is not None and c in self.text_features:
            x[c] = x[c].astype(str)
        else:
            if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
                x[c] = x[c].astype(np.int64)
            elif not is_valid_numeric_array_data(x[c]):
                try:
                    x[c] = pd.to_numeric(x[c], errors="raise")
                except (ValueError, TypeError):
                    self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
                    self.droped_features.append(c)
                    x.drop(columns=[c], inplace=True)

    return x, y, groups, {}
|
375
|
+
|
376
|
+
def _prepare_data(
|
377
|
+
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
378
|
+
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
311
379
|
|
312
380
|
if not isinstance(y, pd.Series):
|
313
381
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
@@ -337,6 +405,25 @@ class EstimatorWrapper:
|
|
337
405
|
|
338
406
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
339
407
|
x, y, _ = self._prepare_data(x, y)
|
408
|
+
|
409
|
+
if self.droped_features:
|
410
|
+
self.logger.warning(f"Dropped features: {self.droped_features}")
|
411
|
+
x = x.drop(columns=self.droped_features)
|
412
|
+
|
413
|
+
for c in x.columns:
|
414
|
+
if c in self.cat_features:
|
415
|
+
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
416
|
+
x[c] = x[c].astype(np.int64)
|
417
|
+
elif x[c].dtype != "category":
|
418
|
+
x[c] = x[c].astype(str)
|
419
|
+
elif self.text_features is not None and c in self.text_features:
|
420
|
+
x[c] = x[c].astype(str)
|
421
|
+
else:
|
422
|
+
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
423
|
+
x[c] = x[c].astype(np.int64)
|
424
|
+
elif not is_valid_numeric_array_data(x[c]):
|
425
|
+
x[c] = pd.to_numeric(x[c], errors="coerce")
|
426
|
+
|
340
427
|
return x, y, {}
|
341
428
|
|
342
429
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
@@ -552,8 +639,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
552
639
|
)
|
553
640
|
self.emb_features = None
|
554
641
|
self.grouped_embedding_features = None
|
555
|
-
self.drop_cat_features = []
|
556
|
-
self.features_to_encode = []
|
557
642
|
|
558
643
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
559
644
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
@@ -562,55 +647,60 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
562
647
|
import catboost
|
563
648
|
from catboost import CatBoostClassifier
|
564
649
|
|
565
|
-
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
650
|
+
if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
651
|
+
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
652
|
+
else:
|
566
653
|
emb_pattern = r"(.+)_emb\d+"
|
567
654
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
568
|
-
|
569
|
-
|
570
|
-
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
571
|
-
f"{self.emb_features}"
|
572
|
-
)
|
573
|
-
x, self.grouped_embedding_features = self.group_embeddings(x)
|
655
|
+
x, self.grouped_embedding_features = self.group_embeddings(x)
|
656
|
+
if len(self.grouped_embedding_features) > 0:
|
574
657
|
params["embedding_features"] = self.grouped_embedding_features
|
575
|
-
else:
|
576
|
-
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
577
|
-
self.grouped_embedding_features = None
|
578
|
-
else:
|
579
|
-
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
580
658
|
|
581
659
|
# Find text features from passed in generate_features
|
582
|
-
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
660
|
+
if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
661
|
+
self.text_features = None
|
662
|
+
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
663
|
+
else:
|
583
664
|
if self.text_features is not None:
|
584
665
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
585
666
|
self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
|
586
667
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
587
668
|
params["text_features"] = self.text_features
|
588
|
-
else:
|
589
|
-
self.text_features = None
|
590
|
-
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
591
669
|
|
592
670
|
# Find rest categorical features
|
593
|
-
self.cat_features
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
671
|
+
self.cat_features = [
|
672
|
+
f
|
673
|
+
for f in self.cat_features
|
674
|
+
if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
|
675
|
+
]
|
676
|
+
if self.cat_features:
|
677
|
+
for c in self.cat_features:
|
598
678
|
if is_numeric_dtype(x[c]):
|
599
679
|
x[c] = x[c].fillna(np.nan)
|
600
|
-
|
680
|
+
elif x[c].dtype != "category":
|
601
681
|
x[c] = x[c].fillna("NA")
|
602
|
-
params["cat_features"] = self.
|
682
|
+
params["cat_features"] = self.cat_features
|
603
683
|
|
604
684
|
return x, y, groups, params
|
605
685
|
|
606
686
|
def group_embeddings(self, df: pd.DataFrame):
    """Collapse scalar embedding columns into one vector column and list all embedding columns.

    When more than three names are present in ``self.emb_features``, those columns
    are filled with 0.0 where missing, packed row-wise into a single list-valued
    column named ``__grouped_embeddings`` on a copy of ``df``, and removed as
    individual columns. Every other column accepted by
    ``is_valid_numeric_array_data`` is reported as an embedding column too.

    Args:
        df: Feature frame; it is not modified in place.

    Returns:
        ``(df, embeddings_columns)``: the possibly regrouped frame and the names of
        its embedding columns, each listed once.
    """
    embeddings_columns = []
    # emb_features stays None until fitting has discovered them.
    emb_features = self.emb_features or []
    if len(emb_features) > 3:
        self.logger.info(
            "Embedding features count more than 3, so group them into one vector for CatBoost: "
            f"{self.emb_features}"
        )
        emb_name = "__grouped_embeddings"
        df = df.copy()
        df[emb_features] = df[emb_features].fillna(0.0)
        embeddings_series = pd.Series(df[emb_features].values.tolist(), index=df.index)
        df = pd.concat([df.drop(columns=emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
        embeddings_columns.append(emb_name)
    for c in df.columns:
        # The grouped column is itself list-valued and would otherwise be reported twice.
        if c not in embeddings_columns and is_valid_numeric_array_data(df[c]):
            embeddings_columns.append(c)

    return df, embeddings_columns
|
614
704
|
|
615
705
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
616
706
|
if "__grouped_embeddings" in shap_values:
|
@@ -620,8 +710,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
620
710
|
return shap_values
|
621
711
|
|
622
712
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
623
|
-
if self.exclude_features:
|
624
|
-
x = x.drop(columns=self.exclude_features)
|
625
713
|
x, y, params = super()._prepare_to_calculate(x, y)
|
626
714
|
if self.text_features:
|
627
715
|
params["text_features"] = self.text_features
|
@@ -629,13 +717,13 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
629
717
|
x, emb_columns = self.group_embeddings(x)
|
630
718
|
params["embedding_features"] = emb_columns
|
631
719
|
|
632
|
-
if self.
|
633
|
-
for c in self.
|
720
|
+
if self.cat_features:
|
721
|
+
for c in self.cat_features:
|
634
722
|
if is_numeric_dtype(x[c]):
|
635
723
|
x[c] = x[c].fillna(np.nan)
|
636
|
-
|
724
|
+
elif x[c].dtype != "category":
|
637
725
|
x[c] = x[c].fillna("NA")
|
638
|
-
params["cat_features"] = self.
|
726
|
+
params["cat_features"] = self.cat_features
|
639
727
|
|
640
728
|
return x, y, params
|
641
729
|
|
@@ -660,7 +748,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
660
748
|
)
|
661
749
|
for f in high_cardinality_features:
|
662
750
|
self.text_features.remove(f)
|
663
|
-
self.
|
751
|
+
self.droped_features.append(f)
|
664
752
|
x = x.drop(columns=f, errors="ignore")
|
665
753
|
return super().cross_val_predict(x, y, baseline_score_column)
|
666
754
|
else:
|
@@ -733,8 +821,6 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
733
821
|
)
|
734
822
|
self.cat_encoder = None
|
735
823
|
self.n_classes = None
|
736
|
-
self.exclude_features = []
|
737
|
-
self.features_to_encode = []
|
738
824
|
|
739
825
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
740
826
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
@@ -744,25 +830,23 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
744
830
|
if self.target_type == ModelTaskType.BINARY:
|
745
831
|
params["eval_metric"] = "auc"
|
746
832
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
747
|
-
self.cat_features
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
|
752
|
-
encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
|
753
|
-
x[self.features_to_encode] = encoded
|
833
|
+
if self.cat_features:
|
834
|
+
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
|
835
|
+
encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
836
|
+
x[self.cat_features] = encoded
|
754
837
|
self.cat_encoder = encoder
|
755
|
-
|
838
|
+
for c in x.columns:
|
839
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
840
|
+
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
841
|
+
self.droped_features.append(c)
|
842
|
+
x = x.drop(columns=c, errors="ignore")
|
756
843
|
return x, y_numpy, groups, params
|
757
844
|
|
758
845
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
    """Run the base preparation, then encode categorical columns with the stored encoder.

    Encoding is skipped when ``self.cat_features`` or ``self.cat_encoder`` is None.
    The categorical columns are cast to ``object``, transformed by
    ``self.cat_encoder`` and stored back as ``category`` dtype.

    Returns:
        ``(x, y_numpy, params)`` as produced by the base class, with ``x`` encoded.
    """
    x, y_numpy, params = super()._prepare_to_calculate(x, y)
    if self.cat_features is None or self.cat_encoder is None:
        return x, y_numpy, params

    raw_categories = x[self.cat_features].astype("object")
    x[self.cat_features] = self.cat_encoder.transform(raw_categories, y_numpy).astype("category")
    return x, y_numpy, params
|
767
851
|
|
768
852
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
@@ -824,9 +908,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
824
908
|
|
825
909
|
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
826
910
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
827
|
-
self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
|
828
|
-
self.logger, x, self.cat_features
|
829
|
-
)
|
830
911
|
num_features = [col for col in x.columns if col not in self.cat_features]
|
831
912
|
x[num_features] = x[num_features].fillna(-999)
|
832
913
|
if self.cat_features:
|
@@ -834,20 +915,23 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
834
915
|
encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
835
916
|
x[self.cat_features] = encoded
|
836
917
|
self.cat_encoder = encoder
|
918
|
+
for c in x.columns:
|
919
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
920
|
+
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
921
|
+
self.droped_features.append(c)
|
922
|
+
x = x.drop(columns=c, errors="ignore")
|
837
923
|
return x, y_numpy, groups, params
|
838
924
|
|
839
925
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
    """Run the base preparation, fill numeric gaps and encode categorical columns.

    When ``self.cat_features`` is not None, every column outside it has its missing
    values replaced by -999. When there are categorical features and a fitted
    ``self.cat_encoder``, those columns are cast to ``object``, transformed by the
    encoder and stored back as ``category`` dtype.

    Returns:
        ``(x, y_numpy, params)`` as produced by the base class, with ``x`` adjusted.
    """
    x, y_numpy, params = super()._prepare_to_calculate(x, y)

    categorical = self.cat_features
    if categorical is not None:
        num_features = [col for col in x.columns if col not in categorical]
        x[num_features] = x[num_features].fillna(-999)

    if categorical and self.cat_encoder is not None:
        raw_categories = x[categorical].astype("object")
        x[categorical] = self.cat_encoder.transform(raw_categories, y_numpy).astype("category")

    return x, y_numpy, params
|
851
935
|
|
852
936
|
|
853
937
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
@@ -933,40 +1017,6 @@ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None
|
|
933
1017
|
return scoring, metric_name, multiplier
|
934
1018
|
|
935
1019
|
|
936
|
-
def _get_cat_features(
|
937
|
-
logger: logging.Logger,
|
938
|
-
x: pd.DataFrame,
|
939
|
-
cat_features: Optional[List[str]],
|
940
|
-
text_features: Optional[List[str]] = None,
|
941
|
-
emb_features: Optional[List[str]] = None,
|
942
|
-
) -> List[str]:
|
943
|
-
cat_features = cat_features or []
|
944
|
-
text_features = text_features or []
|
945
|
-
emb_features = emb_features or []
|
946
|
-
exclude_features = text_features + emb_features
|
947
|
-
cat_features = [c for c in cat_features if c not in exclude_features]
|
948
|
-
unique_cat_features = []
|
949
|
-
drop_cat_features = []
|
950
|
-
for name in cat_features:
|
951
|
-
# Remove constant categorical features
|
952
|
-
if x[name].nunique(dropna=False) > 1:
|
953
|
-
unique_cat_features.append(name)
|
954
|
-
else:
|
955
|
-
logger.warning(f"Drop column {name} on preparing data for fit")
|
956
|
-
x.drop(columns=name, inplace=True)
|
957
|
-
drop_cat_features.append(name)
|
958
|
-
cat_features = unique_cat_features
|
959
|
-
|
960
|
-
logger.info(f"Selected categorical features: {cat_features}")
|
961
|
-
|
962
|
-
features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
|
963
|
-
features_to_encode = [f for f in cat_features if f in features_to_encode]
|
964
|
-
|
965
|
-
logger.info(f"Features to encode: {features_to_encode}")
|
966
|
-
|
967
|
-
return cat_features, features_to_encode, drop_cat_features
|
968
|
-
|
969
|
-
|
970
1020
|
def _get_add_params(input_params, add_params):
|
971
1021
|
output_params = dict(input_params)
|
972
1022
|
if add_params is not None:
|
@@ -1052,3 +1102,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
1052
1102
|
multioutput=multioutput,
|
1053
1103
|
)
|
1054
1104
|
return mse if squared else np.sqrt(mse)
|
1105
|
+
|
1106
|
+
|
1107
|
+
def _get_unique_count(series: pd.Series) -> int:
|
1108
|
+
try:
|
1109
|
+
return series.nunique(dropna=False)
|
1110
|
+
except TypeError:
|
1111
|
+
return series.astype(str).nunique(dropna=False)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.81a3832.
|
3
|
+
Version: 1.2.81a3832.dev10
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,4 +1,4 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=dUnN248oLg0rBaOttshEyx0_AtLIiP6ku5lXmtwrlQo,34
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
@@ -6,7 +6,7 @@ upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
6
|
upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
|
7
7
|
upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=DpXJtooXDCLTJUf3JlfIsJiwx9Hg-2vv4-k4RWkXFMU,42269
|
10
10
|
upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.81a3832.
|
74
|
-
upgini-1.2.81a3832.
|
75
|
-
upgini-1.2.81a3832.
|
76
|
-
upgini-1.2.81a3832.
|
73
|
+
upgini-1.2.81a3832.dev10.dist-info/METADATA,sha256=F0Eg-CF-u-X2QDwUGlH0Fom-Ys1Br4bfoR_RBUq0ob8,49173
|
74
|
+
upgini-1.2.81a3832.dev10.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.81a3832.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.81a3832.dev10.dist-info/RECORD,,
|
File without changes
|
File without changes
|