upgini 1.2.81a3832.dev9__py3-none-any.whl → 1.2.81a3832.dev11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +1 -1
- upgini/metrics.py +197 -119
- upgini/search_task.py +1 -0
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/METADATA +1 -1
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/RECORD +8 -8
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/WHEEL +0 -0
- {upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.81a3832.dev9"
|
1
|
+
__version__ = "1.2.81a3832.dev11"
|
upgini/features_enricher.py
CHANGED
@@ -4245,7 +4245,7 @@ if response.status_code == 200:
|
|
4245
4245
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
4246
4246
|
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
4247
4247
|
if self.fit_columns_renaming:
|
4248
|
-
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
4248
|
+
search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
|
4249
4249
|
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
4250
4250
|
|
4251
4251
|
try:
|
upgini/metrics.py
CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
|
|
15
15
|
from category_encoders.cat_boost import CatBoostEncoder
|
16
16
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
17
17
|
from numpy import log1p
|
18
|
-
from pandas.api.types import is_numeric_dtype
|
18
|
+
from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
|
19
19
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
20
20
|
|
21
21
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
@@ -32,7 +32,10 @@ except ImportError:
|
|
32
32
|
available_scorers = SCORERS
|
33
33
|
from sklearn.metrics import mean_squared_error
|
34
34
|
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
35
|
-
from sklearn.model_selection import
|
35
|
+
from sklearn.model_selection import ( # , TimeSeriesSplit
|
36
|
+
BaseCrossValidator,
|
37
|
+
TimeSeriesSplit,
|
38
|
+
)
|
36
39
|
|
37
40
|
from upgini.errors import ValidationError
|
38
41
|
from upgini.metadata import ModelTaskType
|
@@ -250,6 +253,47 @@ class _CrossValResults:
|
|
250
253
|
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
251
254
|
|
252
255
|
|
256
|
+
def is_numeric_object(x: pd.Series) -> bool:
|
257
|
+
try:
|
258
|
+
pd.to_numeric(x, errors="raise")
|
259
|
+
return True
|
260
|
+
except (ValueError, TypeError):
|
261
|
+
return False
|
262
|
+
|
263
|
+
|
264
|
+
def is_valid_numeric_array_data(data: pd.Series) -> bool:
|
265
|
+
data_without_na = data.dropna()
|
266
|
+
if data_without_na.empty:
|
267
|
+
return False
|
268
|
+
|
269
|
+
first_element = data_without_na.iloc[0]
|
270
|
+
|
271
|
+
# numpy.ndarray with numeric types
|
272
|
+
if isinstance(first_element, np.ndarray):
|
273
|
+
return np.issubdtype(first_element.dtype, np.number)
|
274
|
+
|
275
|
+
# DataFrame with all numeric columns
|
276
|
+
elif isinstance(first_element, pd.DataFrame):
|
277
|
+
return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
|
278
|
+
|
279
|
+
# list or list of lists with numeric types
|
280
|
+
elif isinstance(first_element, list):
|
281
|
+
try:
|
282
|
+
# flat list
|
283
|
+
if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
|
284
|
+
return True
|
285
|
+
# list of lists
|
286
|
+
elif all(
|
287
|
+
isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
|
288
|
+
for x in first_element
|
289
|
+
):
|
290
|
+
return True
|
291
|
+
except Exception:
|
292
|
+
return False
|
293
|
+
|
294
|
+
return False
|
295
|
+
|
296
|
+
|
253
297
|
class EstimatorWrapper:
|
254
298
|
default_estimator: Literal["catboost", "lightgbm"] = "catboost"
|
255
299
|
|
@@ -279,6 +323,10 @@ class EstimatorWrapper:
|
|
279
323
|
self.groups = groups
|
280
324
|
self.text_features = text_features
|
281
325
|
self.logger = logger or logging.getLogger()
|
326
|
+
self.droped_features = []
|
327
|
+
self.converted_to_int = []
|
328
|
+
self.converted_to_str = []
|
329
|
+
self.converted_to_numeric = []
|
282
330
|
|
283
331
|
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
284
332
|
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
@@ -286,28 +334,13 @@ class EstimatorWrapper:
|
|
286
334
|
self.estimator.fit(x, y, **kwargs)
|
287
335
|
return self
|
288
336
|
|
289
|
-
def predict(self, **kwargs):
|
290
|
-
|
291
|
-
|
292
|
-
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
293
|
-
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
294
|
-
return x, y, groups, {}
|
337
|
+
def predict(self, x: pd.DataFrame, **kwargs):
|
338
|
+
x, _, _ = self._prepare_to_calculate(x, None)
|
339
|
+
return self.estimator.predict(x, **kwargs)
|
295
340
|
|
296
341
|
def _prepare_data(
|
297
342
|
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
298
343
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
299
|
-
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
300
|
-
for c in x.columns:
|
301
|
-
if c not in self.cat_features:
|
302
|
-
if is_numeric_dtype(x[c]):
|
303
|
-
x[c] = x[c].astype(float)
|
304
|
-
elif not x[c].dtype == "category":
|
305
|
-
x[c] = x[c].astype(str)
|
306
|
-
else:
|
307
|
-
if x[c].dtype == "category" and x[c].cat.categories.dtype == np.int64:
|
308
|
-
x[c] = x[c].astype(np.int64)
|
309
|
-
elif not is_numeric_dtype(x[c]):
|
310
|
-
x[c] = x[c].astype(str).astype("category")
|
311
344
|
|
312
345
|
if not isinstance(y, pd.Series):
|
313
346
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
@@ -335,8 +368,83 @@ class EstimatorWrapper:
|
|
335
368
|
|
336
369
|
return x, y
|
337
370
|
|
371
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
372
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
373
|
+
|
374
|
+
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
375
|
+
self.droped_features = []
|
376
|
+
self.converted_to_int = []
|
377
|
+
self.converted_to_str = []
|
378
|
+
self.converted_to_numeric = []
|
379
|
+
for c in x.columns:
|
380
|
+
if _get_unique_count(x[c]) < 2:
|
381
|
+
self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
|
382
|
+
if c in self.cat_features:
|
383
|
+
self.cat_features.remove(c)
|
384
|
+
x.drop(columns=[c], inplace=True)
|
385
|
+
self.droped_features.append(c)
|
386
|
+
elif self.text_features is not None and c in self.text_features:
|
387
|
+
x[c] = x[c].astype(str)
|
388
|
+
self.converted_to_str.append(c)
|
389
|
+
elif c in self.cat_features:
|
390
|
+
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
391
|
+
x[c] = x[c].astype(np.int64)
|
392
|
+
self.converted_to_int.append(c)
|
393
|
+
elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
|
394
|
+
self.logger.info(
|
395
|
+
f"Convert categorical feature {c} with integer categories"
|
396
|
+
" to int64 and remove from cat_features"
|
397
|
+
)
|
398
|
+
x[c] = x[c].astype(np.int64)
|
399
|
+
self.converted_to_int.append(c)
|
400
|
+
self.cat_features.remove(c)
|
401
|
+
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
402
|
+
self.logger.info(
|
403
|
+
f"Convert float cat feature {c} to string"
|
404
|
+
)
|
405
|
+
x[c] = x[c].astype(str)
|
406
|
+
self.converted_to_str.append(c)
|
407
|
+
elif x[c].dtype not in ["category", "int64"]:
|
408
|
+
x[c] = x[c].astype(str)
|
409
|
+
self.converted_to_str.append(c)
|
410
|
+
else:
|
411
|
+
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
412
|
+
self.logger.info(f"Convert bool feature {c} to int64")
|
413
|
+
x[c] = x[c].astype(np.int64)
|
414
|
+
self.converted_to_int.append(c)
|
415
|
+
elif not is_valid_numeric_array_data(x[c]):
|
416
|
+
try:
|
417
|
+
x[c] = pd.to_numeric(x[c], errors="raise")
|
418
|
+
self.converted_to_numeric.append(c)
|
419
|
+
except (ValueError, TypeError):
|
420
|
+
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
421
|
+
x.drop(columns=[c], inplace=True)
|
422
|
+
self.droped_features.append(c)
|
423
|
+
|
424
|
+
return x, y, groups, {}
|
425
|
+
|
338
426
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
339
427
|
x, y, _ = self._prepare_data(x, y)
|
428
|
+
|
429
|
+
if self.droped_features:
|
430
|
+
self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
|
431
|
+
x = x.drop(columns=self.droped_features)
|
432
|
+
|
433
|
+
if self.converted_to_int:
|
434
|
+
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
435
|
+
for c in self.converted_to_int:
|
436
|
+
x[c] = x[c].astype(np.int64)
|
437
|
+
|
438
|
+
if self.converted_to_str:
|
439
|
+
self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
|
440
|
+
for c in self.converted_to_str:
|
441
|
+
x[c] = x[c].astype(str)
|
442
|
+
|
443
|
+
if self.converted_to_numeric:
|
444
|
+
self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
|
445
|
+
for c in self.converted_to_numeric:
|
446
|
+
x[c] = pd.to_numeric(x[c], errors="coerce")
|
447
|
+
|
340
448
|
return x, y, {}
|
341
449
|
|
342
450
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
@@ -552,8 +660,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
552
660
|
)
|
553
661
|
self.emb_features = None
|
554
662
|
self.grouped_embedding_features = None
|
555
|
-
self.drop_cat_features = []
|
556
|
-
self.features_to_encode = []
|
557
663
|
|
558
664
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
559
665
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
@@ -562,55 +668,60 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
562
668
|
import catboost
|
563
669
|
from catboost import CatBoostClassifier
|
564
670
|
|
565
|
-
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
671
|
+
if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
672
|
+
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
673
|
+
else:
|
566
674
|
emb_pattern = r"(.+)_emb\d+"
|
567
675
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
568
|
-
|
569
|
-
|
570
|
-
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
571
|
-
f"{self.emb_features}"
|
572
|
-
)
|
573
|
-
x, self.grouped_embedding_features = self.group_embeddings(x)
|
676
|
+
x, self.grouped_embedding_features = self.group_embeddings(x)
|
677
|
+
if len(self.grouped_embedding_features) > 0:
|
574
678
|
params["embedding_features"] = self.grouped_embedding_features
|
575
|
-
else:
|
576
|
-
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
577
|
-
self.grouped_embedding_features = None
|
578
|
-
else:
|
579
|
-
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
580
679
|
|
581
680
|
# Find text features from passed in generate_features
|
582
|
-
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
681
|
+
if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
682
|
+
self.text_features = None
|
683
|
+
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
684
|
+
else:
|
583
685
|
if self.text_features is not None:
|
584
686
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
585
687
|
self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
|
586
688
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
587
689
|
params["text_features"] = self.text_features
|
588
|
-
else:
|
589
|
-
self.text_features = None
|
590
|
-
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
591
690
|
|
592
691
|
# Find rest categorical features
|
593
|
-
self.cat_features
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
692
|
+
self.cat_features = [
|
693
|
+
f
|
694
|
+
for f in self.cat_features
|
695
|
+
if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
|
696
|
+
]
|
697
|
+
if self.cat_features:
|
698
|
+
for c in self.cat_features:
|
598
699
|
if is_numeric_dtype(x[c]):
|
599
700
|
x[c] = x[c].fillna(np.nan)
|
600
|
-
|
701
|
+
elif x[c].dtype != "category":
|
601
702
|
x[c] = x[c].fillna("NA")
|
602
|
-
params["cat_features"] = self.
|
703
|
+
params["cat_features"] = self.cat_features
|
603
704
|
|
604
705
|
return x, y, groups, params
|
605
706
|
|
606
707
|
def group_embeddings(self, df: pd.DataFrame):
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
708
|
+
embeddings_columns = []
|
709
|
+
if len(self.emb_features) > 3:
|
710
|
+
self.logger.info(
|
711
|
+
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
712
|
+
f"{self.emb_features}"
|
713
|
+
)
|
714
|
+
emb_name = "__grouped_embeddings"
|
715
|
+
df = df.copy()
|
716
|
+
df[self.emb_features] = df[self.emb_features].fillna(0.0)
|
717
|
+
embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
|
718
|
+
df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
|
719
|
+
embeddings_columns.append(emb_name)
|
720
|
+
for c in df.columns:
|
721
|
+
if is_valid_numeric_array_data(df[c]):
|
722
|
+
embeddings_columns.append(c)
|
723
|
+
|
724
|
+
return df, embeddings_columns
|
614
725
|
|
615
726
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
616
727
|
if "__grouped_embeddings" in shap_values:
|
@@ -620,8 +731,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
620
731
|
return shap_values
|
621
732
|
|
622
733
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
623
|
-
if self.exclude_features:
|
624
|
-
x = x.drop(columns=self.exclude_features)
|
625
734
|
x, y, params = super()._prepare_to_calculate(x, y)
|
626
735
|
if self.text_features:
|
627
736
|
params["text_features"] = self.text_features
|
@@ -629,13 +738,13 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
629
738
|
x, emb_columns = self.group_embeddings(x)
|
630
739
|
params["embedding_features"] = emb_columns
|
631
740
|
|
632
|
-
if self.
|
633
|
-
for c in self.
|
741
|
+
if self.cat_features:
|
742
|
+
for c in self.cat_features:
|
634
743
|
if is_numeric_dtype(x[c]):
|
635
744
|
x[c] = x[c].fillna(np.nan)
|
636
|
-
|
745
|
+
elif x[c].dtype != "category":
|
637
746
|
x[c] = x[c].fillna("NA")
|
638
|
-
params["cat_features"] = self.
|
747
|
+
params["cat_features"] = self.cat_features
|
639
748
|
|
640
749
|
return x, y, params
|
641
750
|
|
@@ -660,7 +769,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
660
769
|
)
|
661
770
|
for f in high_cardinality_features:
|
662
771
|
self.text_features.remove(f)
|
663
|
-
self.
|
772
|
+
self.droped_features.append(f)
|
664
773
|
x = x.drop(columns=f, errors="ignore")
|
665
774
|
return super().cross_val_predict(x, y, baseline_score_column)
|
666
775
|
else:
|
@@ -733,8 +842,6 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
733
842
|
)
|
734
843
|
self.cat_encoder = None
|
735
844
|
self.n_classes = None
|
736
|
-
self.exclude_features = []
|
737
|
-
self.features_to_encode = []
|
738
845
|
|
739
846
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
740
847
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
@@ -744,25 +851,23 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
744
851
|
if self.target_type == ModelTaskType.BINARY:
|
745
852
|
params["eval_metric"] = "auc"
|
746
853
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
747
|
-
self.cat_features
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
|
752
|
-
encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
|
753
|
-
x[self.features_to_encode] = encoded
|
854
|
+
if self.cat_features:
|
855
|
+
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
|
856
|
+
encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
857
|
+
x[self.cat_features] = encoded
|
754
858
|
self.cat_encoder = encoder
|
755
|
-
|
859
|
+
for c in x.columns:
|
860
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
861
|
+
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
862
|
+
self.droped_features.append(c)
|
863
|
+
x = x.drop(columns=c, errors="ignore")
|
756
864
|
return x, y_numpy, groups, params
|
757
865
|
|
758
866
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
759
|
-
if self.exclude_features:
|
760
|
-
x = x.drop(columns=self.exclude_features)
|
761
867
|
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
762
|
-
if self.
|
763
|
-
|
764
|
-
|
765
|
-
)
|
868
|
+
if self.cat_features is not None and self.cat_encoder is not None:
|
869
|
+
encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
870
|
+
x[self.cat_features] = encoded
|
766
871
|
return x, y_numpy, params
|
767
872
|
|
768
873
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
@@ -824,9 +929,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
824
929
|
|
825
930
|
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
826
931
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
827
|
-
self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
|
828
|
-
self.logger, x, self.cat_features
|
829
|
-
)
|
830
932
|
num_features = [col for col in x.columns if col not in self.cat_features]
|
831
933
|
x[num_features] = x[num_features].fillna(-999)
|
832
934
|
if self.cat_features:
|
@@ -834,20 +936,23 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
834
936
|
encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
835
937
|
x[self.cat_features] = encoded
|
836
938
|
self.cat_encoder = encoder
|
939
|
+
for c in x.columns:
|
940
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
941
|
+
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
942
|
+
self.droped_features.append(c)
|
943
|
+
x = x.drop(columns=c, errors="ignore")
|
837
944
|
return x, y_numpy, groups, params
|
838
945
|
|
839
946
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
840
|
-
|
841
|
-
x = x.drop(columns=self.exclude_features)
|
842
|
-
x, y, params = super()._prepare_to_calculate(x, y)
|
947
|
+
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
843
948
|
if self.cat_features is not None:
|
844
949
|
num_features = [col for col in x.columns if col not in self.cat_features]
|
845
950
|
x[num_features] = x[num_features].fillna(-999)
|
846
|
-
if self.
|
847
|
-
x[self.
|
848
|
-
x[self.
|
951
|
+
if self.cat_features and self.cat_encoder is not None:
|
952
|
+
x[self.cat_features] = self.cat_encoder.transform(
|
953
|
+
x[self.cat_features].astype("object"), y_numpy
|
849
954
|
).astype("category")
|
850
|
-
return x,
|
955
|
+
return x, y_numpy, params
|
851
956
|
|
852
957
|
|
853
958
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
@@ -933,40 +1038,6 @@ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None
|
|
933
1038
|
return scoring, metric_name, multiplier
|
934
1039
|
|
935
1040
|
|
936
|
-
def _get_cat_features(
|
937
|
-
logger: logging.Logger,
|
938
|
-
x: pd.DataFrame,
|
939
|
-
cat_features: Optional[List[str]],
|
940
|
-
text_features: Optional[List[str]] = None,
|
941
|
-
emb_features: Optional[List[str]] = None,
|
942
|
-
) -> List[str]:
|
943
|
-
cat_features = cat_features or []
|
944
|
-
text_features = text_features or []
|
945
|
-
emb_features = emb_features or []
|
946
|
-
exclude_features = text_features + emb_features
|
947
|
-
cat_features = [c for c in cat_features if c not in exclude_features]
|
948
|
-
unique_cat_features = []
|
949
|
-
drop_cat_features = []
|
950
|
-
for name in cat_features:
|
951
|
-
# Remove constant categorical features
|
952
|
-
if x[name].nunique(dropna=False) > 1:
|
953
|
-
unique_cat_features.append(name)
|
954
|
-
else:
|
955
|
-
logger.warning(f"Drop column {name} on preparing data for fit")
|
956
|
-
x.drop(columns=name, inplace=True)
|
957
|
-
drop_cat_features.append(name)
|
958
|
-
cat_features = unique_cat_features
|
959
|
-
|
960
|
-
logger.info(f"Selected categorical features: {cat_features}")
|
961
|
-
|
962
|
-
features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
|
963
|
-
features_to_encode = [f for f in cat_features if f in features_to_encode]
|
964
|
-
|
965
|
-
logger.info(f"Features to encode: {features_to_encode}")
|
966
|
-
|
967
|
-
return cat_features, features_to_encode, drop_cat_features
|
968
|
-
|
969
|
-
|
970
1041
|
def _get_add_params(input_params, add_params):
|
971
1042
|
output_params = dict(input_params)
|
972
1043
|
if add_params is not None:
|
@@ -1052,3 +1123,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
1052
1123
|
multioutput=multioutput,
|
1053
1124
|
)
|
1054
1125
|
return mse if squared else np.sqrt(mse)
|
1126
|
+
|
1127
|
+
|
1128
|
+
def _get_unique_count(series: pd.Series) -> int:
|
1129
|
+
try:
|
1130
|
+
return series.nunique(dropna=False)
|
1131
|
+
except TypeError:
|
1132
|
+
return series.astype(str).nunique(dropna=False)
|
upgini/search_task.py
CHANGED
{upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.81a3832.dev9
|
3
|
+
Version: 1.2.81a3832.dev11
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
{upgini-1.2.81a3832.dev9.dist-info → upgini-1.2.81a3832.dev11.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=aMsoGp7JafbllKBjbZ_9sxh2xfd5oZMdcOt6Id_WaBU,34
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=aIG16mpdUZqV0GLMoDA4LiXMPbu3a-m72mhVqGnIww4,210860
|
7
7
|
upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
10
|
-
upgini/search_task.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=Zb-AwpstEKWaIuLqfIWLF--UGQwIoLbGYnHRlyPQ_cY,43304
|
10
|
+
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
13
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.81a3832.
|
74
|
-
upgini-1.2.81a3832.dev9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
-
upgini-1.2.81a3832.dev9.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
-
upgini-1.2.81a3832.dev9.dist-info/RECORD,,
|
73
|
+
upgini-1.2.81a3832.dev11.dist-info/METADATA,sha256=h9Tlze7oWU3tEfYMuF9BYZTD7hlFeZM-zjrkIzMml4k,49173
|
74
|
+
upgini-1.2.81a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.81a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.81a3832.dev11.dist-info/RECORD,,
|
File without changes
|
File without changes
|