upgini-1.2.81a3832.dev9-py3-none-any.whl → upgini-1.2.81a3832.dev11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.81a3832.dev9"
+__version__ = "1.2.81a3832.dev11"
upgini/features_enricher.py CHANGED
@@ -4245,7 +4245,7 @@ if response.status_code == 200:
     def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
         search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
         if self.fit_columns_renaming:
-            search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
+            search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
         msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)

         try:
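
For context, the `sorted(set(...))` change both de-duplicates search key names that collapse to the same column after renaming and makes the displayed order deterministic. A minimal sketch with hypothetical column names:

    renaming = {"phone_a": "phone", "phone_b": "phone", "cc": "country"}
    search_key_names = ["phone_a", "phone_b", "cc"]

    # old behavior: duplicates survive renaming
    [renaming.get(col, col) for col in search_key_names]
    # -> ['phone', 'phone', 'country']

    # new behavior: unique names in stable alphabetical order
    sorted(set(renaming.get(col, col) for col in search_key_names))
    # -> ['country', 'phone']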
upgini/metrics.py CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
 from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score

 from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -32,7 +32,10 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit  # , TimeSeriesSplit
+from sklearn.model_selection import (  # , TimeSeriesSplit
+    BaseCrossValidator,
+    TimeSeriesSplit,
+)

 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -250,6 +253,47 @@ class _CrossValResults:
         return f"{self.metric:.3f} ± {self.metric_std:.3f}"


+def is_numeric_object(x: pd.Series) -> bool:
+    try:
+        pd.to_numeric(x, errors="raise")
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_valid_numeric_array_data(data: pd.Series) -> bool:
+    data_without_na = data.dropna()
+    if data_without_na.empty:
+        return False
+
+    first_element = data_without_na.iloc[0]
+
+    # numpy.ndarray with numeric types
+    if isinstance(first_element, np.ndarray):
+        return np.issubdtype(first_element.dtype, np.number)
+
+    # DataFrame with all numeric columns
+    elif isinstance(first_element, pd.DataFrame):
+        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
+
+    # list or list of lists with numeric types
+    elif isinstance(first_element, list):
+        try:
+            # flat list
+            if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
+                return True
+            # list of lists
+            elif all(
+                isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
+                for x in first_element
+            ):
+                return True
+        except Exception:
+            return False
+
+    return False
+
+
 class EstimatorWrapper:
     default_estimator: Literal["catboost", "lightgbm"] = "catboost"

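A rough illustration of what the new `is_valid_numeric_array_data` helper classifies as array data, assuming only pandas and numpy (values are made up). Note it inspects just the first non-NA element, so a mixed column is judged by its first row alone:

    import numpy as np
    import pandas as pd

    emb = pd.Series([np.array([0.1, 0.2]), np.array([0.3, 0.4])])
    # rows holding numeric ndarrays -> True

    flat = pd.Series([[1, 2.5], [3, None]])
    # rows holding flat numeric lists (NaNs allowed) -> True

    text = pd.Series(["a", "b"])
    # plain scalars such as strings -> False
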
@@ -279,6 +323,10 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []

     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -286,28 +334,13 @@ class EstimatorWrapper:
         self.estimator.fit(x, y, **kwargs)
         return self

-    def predict(self, **kwargs):
-        return self.estimator.predict(**kwargs)
-
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-        return x, y, groups, {}
+    def predict(self, x: pd.DataFrame, **kwargs):
+        x, _, _ = self._prepare_to_calculate(x, None)
+        return self.estimator.predict(x, **kwargs)

     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        for c in x.columns:
-            if c not in self.cat_features:
-                if is_numeric_dtype(x[c]):
-                    x[c] = x[c].astype(float)
-                elif not x[c].dtype == "category":
-                    x[c] = x[c].astype(str)
-            else:
-                if x[c].dtype == "category" and x[c].cat.categories.dtype == np.int64:
-                    x[c] = x[c].astype(np.int64)
-                elif not is_numeric_dtype(x[c]):
-                    x[c] = x[c].astype(str).astype("category")

         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -335,8 +368,83 @@ class EstimatorWrapper:

         return x, y

+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
+
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
+        for c in x.columns:
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+            else:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif not is_valid_numeric_array_data(x[c]):
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
+
         return x, y, {}

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
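
Taken together, `_prepare_to_fit` now records every schema decision (drops, bool-to-int casts, string casts, numeric coercions) so that `_prepare_to_calculate` can replay them on evaluation data. A condensed sketch of that replay contract, with hypothetical columns:

    import numpy as np
    import pandas as pd

    train = pd.DataFrame({
        "const": [1, 1, 1],           # <2 unique values -> dropped
        "flag": [True, False, True],  # bool -> int64
        "nums": ["1", "2", "3"],      # object, but parseable -> numeric
    })
    # what fit remembers, schematically:
    droped_features, converted_to_int, converted_to_numeric = ["const"], ["flag"], ["nums"]

    # what calculate replays on new data:
    eval_df = pd.DataFrame({"const": [2, 2], "flag": [False, True], "nums": ["4", "oops"]})
    eval_df = eval_df.drop(columns=droped_features)
    for c in converted_to_int:
        eval_df[c] = eval_df[c].astype(np.int64)
    for c in converted_to_numeric:
        # fit uses errors="raise"; calculate uses errors="coerce", so "oops" becomes NaN
        eval_df[c] = pd.to_numeric(eval_df[c], errors="coerce")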
@@ -552,8 +660,6 @@ class CatBoostWrapper(EstimatorWrapper):
         )
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.drop_cat_features = []
-        self.features_to_encode = []

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -562,55 +668,60 @@ class CatBoostWrapper(EstimatorWrapper):
         import catboost
         from catboost import CatBoostClassifier

-        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
+        else:
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
-            if len(self.emb_features) > 3:  # There is no reason to reduce embeddings dimension with less than 4
-                self.logger.info(
-                    "Embedding features count more than 3, so group them into one vector for CatBoost: "
-                    f"{self.emb_features}"
-                )
-                x, self.grouped_embedding_features = self.group_embeddings(x)
+            x, self.grouped_embedding_features = self.group_embeddings(x)
+            if len(self.grouped_embedding_features) > 0:
                 params["embedding_features"] = self.grouped_embedding_features
-            else:
-                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
-                self.grouped_embedding_features = None
-        else:
-            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")

         # Find text features from passed in generate_features
-        if hasattr(CatBoostClassifier, "get_text_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
+            self.text_features = None
+            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
+        else:
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
                 self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
-        else:
-            self.text_features = None
-            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")

         # Find rest categorical features
-        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
-            self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
-        )
-        if self.features_to_encode:
-            for c in self.features_to_encode:
+        self.cat_features = [
+            f
+            for f in self.cat_features
+            if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
+        ]
+        if self.cat_features:
+            for c in self.cat_features:
                 if is_numeric_dtype(x[c]):
                     x[c] = x[c].fillna(np.nan)
-                else:
+                elif x[c].dtype != "category":
                     x[c] = x[c].fillna("NA")
-        params["cat_features"] = self.features_to_encode
+            params["cat_features"] = self.cat_features

         return x, y, groups, params

     def group_embeddings(self, df: pd.DataFrame):
-        emb_name = "__grouped_embeddings"
-        df = df.copy()
-        df[self.emb_features] = df[self.emb_features].fillna(0.0)
-        embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
-        df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
-
-        return df, [emb_name]
+        embeddings_columns = []
+        if len(self.emb_features) > 3:
+            self.logger.info(
+                "Embedding features count more than 3, so group them into one vector for CatBoost: "
+                f"{self.emb_features}"
+            )
+            emb_name = "__grouped_embeddings"
+            df = df.copy()
+            df[self.emb_features] = df[self.emb_features].fillna(0.0)
+            embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
+            df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
+            embeddings_columns.append(emb_name)
+        for c in df.columns:
+            if is_valid_numeric_array_data(df[c]):
+                embeddings_columns.append(c)
+
+        return df, embeddings_columns

     def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
         if "__grouped_embeddings" in shap_values:
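
The reworked `group_embeddings` packs more than three `*_emb<N>` columns into a single list-valued `__grouped_embeddings` column and also collects any column that already holds numeric arrays, so CatBoost receives them via `params["embedding_features"]`. A sketch of the grouping step on toy data:

    import pandas as pd

    df = pd.DataFrame({
        "f_emb0": [0.1, 0.2], "f_emb1": [0.3, 0.4],
        "f_emb2": [0.5, 0.6], "f_emb3": [0.7, 0.8],  # four columns, so > 3 triggers grouping
        "price": [10.0, 20.0],
    })
    emb_features = ["f_emb0", "f_emb1", "f_emb2", "f_emb3"]

    grouped = pd.Series(df[emb_features].fillna(0.0).values.tolist(), index=df.index)
    df = pd.concat([df.drop(columns=emb_features), pd.DataFrame({"__grouped_embeddings": grouped})], axis=1)
    # df["__grouped_embeddings"] -> [[0.1, 0.3, 0.5, 0.7], [0.2, 0.4, 0.6, 0.8]]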
@@ -620,8 +731,6 @@ class CatBoostWrapper(EstimatorWrapper):
         return shap_values

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
@@ -629,13 +738,13 @@ class CatBoostWrapper(EstimatorWrapper):
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns

-        if self.features_to_encode:
-            for c in self.features_to_encode:
+        if self.cat_features:
+            for c in self.cat_features:
                 if is_numeric_dtype(x[c]):
                     x[c] = x[c].fillna(np.nan)
-                else:
+                elif x[c].dtype != "category":
                     x[c] = x[c].fillna("NA")
-            params["cat_features"] = self.features_to_encode
+            params["cat_features"] = self.cat_features

         return x, y, params

@@ -660,7 +769,7 @@ class CatBoostWrapper(EstimatorWrapper):
                 )
                 for f in high_cardinality_features:
                     self.text_features.remove(f)
-                    self.exclude_features.append(f)
+                    self.droped_features.append(f)
                     x = x.drop(columns=f, errors="ignore")
                 return super().cross_val_predict(x, y, baseline_score_column)
             else:
@@ -733,8 +842,6 @@ class LightGBMWrapper(EstimatorWrapper):
         )
         self.cat_encoder = None
         self.n_classes = None
-        self.exclude_features = []
-        self.features_to_encode = []

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
@@ -744,25 +851,23 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
-            self.logger, x, self.cat_features
-        )
-        if self.features_to_encode:
-            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
-            encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
-            x[self.features_to_encode] = encoded
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
             self.cat_encoder = encoder
-
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.features_to_encode is not None and self.cat_encoder is not None:
-            x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
-                "category"
-            )
+        if self.cat_features is not None and self.cat_encoder is not None:
+            encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
         return x, y_numpy, params

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
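
Since LightGBM gets no raw string categories here, the wrapper now target-encodes `cat_features` in place with `category_encoders.CatBoostEncoder` (note the explicit `cols=` argument) and later reuses the fitted encoder. A minimal sketch with toy data; column names are hypothetical:

    import pandas as pd
    from category_encoders.cat_boost import CatBoostEncoder

    x = pd.DataFrame({"city": ["ny", "la", "ny", "sf"], "age": [25, 32, 41, 29]})
    y = [1, 0, 1, 0]

    encoder = CatBoostEncoder(random_state=42, cols=["city"], return_df=True)
    x[["city"]] = encoder.fit_transform(x[["city"]].astype("object"), y).astype("category")

    # at metric time the fitted encoder is applied to evaluation rows
    # (the wrapper above also passes the target to transform when it is available)
    x_eval = pd.DataFrame({"city": ["ny", "sf"], "age": [30, 35]})
    x_eval[["city"]] = encoder.transform(x_eval[["city"]].astype("object")).astype("category")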
@@ -824,9 +929,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):

     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
-        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
-            self.logger, x, self.cat_features
-        )
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
         if self.cat_features:
@@ -834,20 +936,23 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
             x[self.cat_features] = encoded
             self.cat_encoder = encoder
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-        if self.features_to_encode and self.cat_encoder is not None:
-            x[self.features_to_encode] = self.cat_encoder.transform(
-                x[self.features_to_encode].astype("object")
+        if self.cat_features and self.cat_encoder is not None:
+            x[self.cat_features] = self.cat_encoder.transform(
+                x[self.cat_features].astype("object"), y_numpy
             ).astype("category")
-        return x, y, params
+        return x, y_numpy, params


 def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -933,40 +1038,6 @@ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None
     return scoring, metric_name, multiplier


-def _get_cat_features(
-    logger: logging.Logger,
-    x: pd.DataFrame,
-    cat_features: Optional[List[str]],
-    text_features: Optional[List[str]] = None,
-    emb_features: Optional[List[str]] = None,
-) -> List[str]:
-    cat_features = cat_features or []
-    text_features = text_features or []
-    emb_features = emb_features or []
-    exclude_features = text_features + emb_features
-    cat_features = [c for c in cat_features if c not in exclude_features]
-    unique_cat_features = []
-    drop_cat_features = []
-    for name in cat_features:
-        # Remove constant categorical features
-        if x[name].nunique(dropna=False) > 1:
-            unique_cat_features.append(name)
-        else:
-            logger.warning(f"Drop column {name} on preparing data for fit")
-            x.drop(columns=name, inplace=True)
-            drop_cat_features.append(name)
-    cat_features = unique_cat_features
-
-    logger.info(f"Selected categorical features: {cat_features}")
-
-    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
-    features_to_encode = [f for f in cat_features if f in features_to_encode]
-
-    logger.info(f"Features to encode: {features_to_encode}")
-
-    return cat_features, features_to_encode, drop_cat_features
-
-
 def _get_add_params(input_params, add_params):
     output_params = dict(input_params)
     if add_params is not None:
@@ -1052,3 +1123,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
         multioutput=multioutput,
     )
     return mse if squared else np.sqrt(mse)
+
+
+def _get_unique_count(series: pd.Series) -> int:
+    try:
+        return series.nunique(dropna=False)
+    except TypeError:
+        return series.astype(str).nunique(dropna=False)
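
`_get_unique_count` exists because `Series.nunique` hashes values and raises `TypeError` on rows holding lists or arrays (exactly the embedding columns handled above); the fallback counts string representations instead. For example:

    import pandas as pd

    s = pd.Series([[1, 2], [1, 2], [3, 4]])
    # s.nunique(dropna=False) raises TypeError: unhashable type: 'list'
    s.astype(str).nunique(dropna=False)  # -> 2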
upgini/search_task.py CHANGED
@@ -179,6 +179,7 @@ class SearchTask:
             for f in meta.generated_features
             for c in f.base_columns
             if c.ads_definition_id is None
+            and not c.original_name.endswith("_emb")  # embeddings already added
         )
         return list(features_for_transform)

upgini-1.2.81a3832.dev9.dist-info/METADATA → upgini-1.2.81a3832.dev11.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev9
+Version: 1.2.81a3832.dev11
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
upgini-1.2.81a3832.dev9.dist-info/RECORD → upgini-1.2.81a3832.dev11.dist-info/RECORD
@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=wEcwloV3XNyxWA40HLqEb4PIXttvc8pREucBfzAKW0c,33
+upgini/__about__.py,sha256=aMsoGp7JafbllKBjbZ_9sxh2xfd5oZMdcOt6Id_WaBU,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
+upgini/features_enricher.py,sha256=aIG16mpdUZqV0GLMoDA4LiXMPbu3a-m72mhVqGnIww4,210860
 upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=4ehQO8VEebKLiCuBq2LRqC2QbPIqswoe7b1pnR_-zQA,39985
-upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
+upgini/metrics.py,sha256=Zb-AwpstEKWaIuLqfIWLF--UGQwIoLbGYnHRlyPQ_cY,43304
+upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev9.dist-info/METADATA,sha256=6jP4TJl2tN98P8wuWIBARzrPtZVRT48uPukgTvZOvlA,49172
-upgini-1.2.81a3832.dev9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.81a3832.dev9.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.81a3832.dev9.dist-info/RECORD,,
+upgini-1.2.81a3832.dev11.dist-info/METADATA,sha256=h9Tlze7oWU3tEfYMuF9BYZTD7hlFeZM-zjrkIzMml4k,49173
+upgini-1.2.81a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev11.dist-info/RECORD,,