upgini 1.2.81a3832.dev10__py3-none-any.whl → 1.2.81a3832.dev12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.81a3832.dev10"
+__version__ = "1.2.81a3832.dev12"
upgini/features_enricher.py CHANGED
@@ -1017,6 +1017,12 @@ class FeaturesEnricher(TransformerMixin):
         else:
             client_cat_features = []
 
+        # rename baseline_score_column
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        baseline_score_column = self.baseline_score_column
+        if baseline_score_column is not None:
+            baseline_score_column = reversed_renaming[baseline_score_column]
+
         gc.collect()
 
         if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1069,7 +1075,7 @@ class FeaturesEnricher(TransformerMixin):
                 has_date=has_date,
             )
             etalon_cv_result = baseline_estimator.cross_val_predict(
-                fitting_X, y_sorted, self.baseline_score_column
+                fitting_X, y_sorted, baseline_score_column
             )
             etalon_metric = etalon_cv_result.get_display_metric()
             if etalon_metric is None:
@@ -1165,7 +1171,7 @@ class FeaturesEnricher(TransformerMixin):
                         f"on client features: {eval_X_sorted.columns.to_list()}"
                     )
                     etalon_eval_results = baseline_estimator.calculate_metric(
-                        eval_X_sorted, eval_y_sorted, self.baseline_score_column
+                        eval_X_sorted, eval_y_sorted, baseline_score_column
                    )
                     etalon_eval_metric = etalon_eval_results.get_display_metric()
                     self.logger.info(
@@ -1959,6 +1965,14 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )
 
+        # Add hash-suffixes because output of transform has original names
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        X_sampled.rename(columns=reversed_renaming, inplace=True)
+        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
+            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+
         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         return self.__cache_and_return_results(
@@ -4245,7 +4259,7 @@ if response.status_code == 200:
     def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
         search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
         if self.fit_columns_renaming:
-            search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
+            search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
         msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
 
         try:
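
Note on the features_enricher.py changes above: several of them rely on inverting columns_renaming (which maps internal, hash-suffixed column names back to the original client names) so that values given in original names, such as baseline_score_column, can be looked up in the internally renamed frames. The following is a minimal standalone sketch of that pattern with made-up column names, not the enricher's real internals:

    # Sketch only: hypothetical column names, not the upgini internals.
    columns_renaming = {"age_ab12cd": "age", "score_ef34gh": "score"}  # internal -> original
    reversed_renaming = {v: k for k, v in columns_renaming.items()}    # original -> internal

    baseline_score_column = "score"  # user-facing (original) name
    if baseline_score_column is not None:
        baseline_score_column = reversed_renaming[baseline_score_column]
    print(baseline_score_column)  # "score_ef34gh", the name present in the renamed frame

The inversion assumes the renaming is one-to-one; duplicate original names would silently collapse to a single key.
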
upgini/metrics.py CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
 from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
 
 from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -324,6 +324,9 @@ class EstimatorWrapper:
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
         self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
 
     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -335,44 +338,6 @@ class EstimatorWrapper:
         x, _, _ = self._prepare_to_calculate(x, None)
         return self.estimator.predict(x, **kwargs)
 
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        self.droped_features = []
-        for c in x.columns:
-            if _get_unique_count(x[c]) < 2:
-                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
-                self.droped_features.append(c)
-                if c in self.cat_features:
-                    self.cat_features.remove(c)
-                x.drop(columns=[c], inplace=True)
-            elif c in self.cat_features:
-                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
-                    x[c] = x[c].astype(np.int64)
-                elif is_numeric_object(x[c]):
-                    self.logger.warning(
-                        f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
-                    )
-                    x[c] = pd.to_numeric(x[c], errors="coerce")
-                    self.cat_features.remove(c)
-                elif x[c].dtype != "category":
-                    x[c] = x[c].astype(str)
-            elif self.text_features is not None and c in self.text_features:
-                x[c] = x[c].astype(str)
-            else:
-                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
-                    x[c] = x[c].astype(np.int64)
-                elif not is_valid_numeric_array_data(x[c]):
-                    try:
-                        x[c] = pd.to_numeric(x[c], errors="raise")
-                    except (ValueError, TypeError):
-                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
-                        self.droped_features.append(c)
-                        x.drop(columns=[c], inplace=True)
-
-        return x, y, groups, {}
-
     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
@@ -403,26 +368,82 @@ class EstimatorWrapper:
 
         return x, y
 
-    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, _ = self._prepare_data(x, y)
-
-        if self.droped_features:
-            self.logger.warning(f"Dropped features: {self.droped_features}")
-            x = x.drop(columns=self.droped_features)
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
 
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
         for c in x.columns:
-            if c in self.cat_features:
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
                 if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
                     x[c] = x[c].astype(np.int64)
-                elif x[c].dtype != "category":
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
                     x[c] = x[c].astype(str)
-            elif self.text_features is not None and c in self.text_features:
-                x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
             else:
                 if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
                     x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
                 elif not is_valid_numeric_array_data(x[c]):
-                    x[c] = pd.to_numeric(x[c], errors="coerce")
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
 
         return x, y, {}
 
@@ -443,6 +464,8 @@ class EstimatorWrapper:
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
+            metric_std = None
+            average_shap_values = None
         else:
             self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
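
Note on the metrics.py changes above: the reorganised _prepare_to_fit / _prepare_to_calculate pair follows a record-and-replay pattern, where dtype decisions made on the training frame (drop, cast to int or str, coerce to numeric) are remembered in lists and replayed on the evaluation frame, so both frames reach the estimator with matching dtypes. The following is a compact sketch of that idea under simplified rules, using a hypothetical DtypePreparer class rather than the upgini API:

    import pandas as pd

    class DtypePreparer:
        """Sketch: record dtype decisions at fit time, replay them at metric time."""

        def __init__(self):
            self.dropped = []      # columns removed during fit
            self.to_str = []       # columns cast to str during fit
            self.to_numeric = []   # columns coerced to numeric during fit

        def prepare_fit(self, x: pd.DataFrame) -> pd.DataFrame:
            x = x.copy()
            for c in list(x.columns):
                if x[c].nunique(dropna=False) < 2:
                    x = x.drop(columns=[c])
                    self.dropped.append(c)
                elif x[c].dtype == object:
                    try:
                        x[c] = pd.to_numeric(x[c], errors="raise")
                        self.to_numeric.append(c)
                    except (ValueError, TypeError):
                        x[c] = x[c].astype(str)
                        self.to_str.append(c)
            return x

        def prepare_calculate(self, x: pd.DataFrame) -> pd.DataFrame:
            # Replay exactly the decisions recorded during fit.
            x = x.drop(columns=self.dropped, errors="ignore").copy()
            for c in self.to_str:
                x[c] = x[c].astype(str)
            for c in self.to_numeric:
                x[c] = pd.to_numeric(x[c], errors="coerce")
            return x

Using errors="coerce" at replay time mirrors the diff: values that fail conversion on the evaluation frame become NaN instead of aborting the metric run.
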
upgini/search_task.py CHANGED
@@ -179,6 +179,7 @@ class SearchTask:
             for f in meta.generated_features
             for c in f.base_columns
             if c.ads_definition_id is None
+            and not c.original_name.endswith("_emb")  # embeddings already added
         )
         return list(features_for_transform)
 
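
For context, the added predicate excludes base columns whose original name ends with "_emb" when collecting features for transform, since, per the inline comment, the embeddings are already added. An illustrative sketch with stand-in metadata objects, not the real SearchTask structures:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class BaseColumn:  # hypothetical stand-in for the real metadata class
        original_name: str
        ads_definition_id: Optional[str] = None

    base_columns = [BaseColumn("city"), BaseColumn("city_emb"), BaseColumn("paid_col", "ads-1")]

    features_for_transform = {
        c.original_name
        for c in base_columns
        if c.ads_definition_id is None
        and not c.original_name.endswith("_emb")  # embeddings already added
    }
    print(sorted(features_for_transform))  # ['city']
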
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev10
+Version: 1.2.81a3832.dev12
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=dUnN248oLg0rBaOttshEyx0_AtLIiP6ku5lXmtwrlQo,34
+upgini/__about__.py,sha256=QoAMu0gkmwzsYvsLvBmcg4CfaE-sL6T-rz9s8HCGZY4,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
+upgini/features_enricher.py,sha256=cbQydnSOr7-ioQuEs-X3KYd0ays1BPuwFE_sKmOQc5E,211702
 upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=DpXJtooXDCLTJUf3JlfIsJiwx9Hg-2vv4-k4RWkXFMU,42269
-upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
+upgini/metrics.py,sha256=sbxnFyMWCUsVSAy-OwNmDYJxVlGEnTArVUnTOID7miU,43373
+upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev10.dist-info/METADATA,sha256=F0Eg-CF-u-X2QDwUGlH0Fom-Ys1Br4bfoR_RBUq0ob8,49173
-upgini-1.2.81a3832.dev10.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.81a3832.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.81a3832.dev10.dist-info/RECORD,,
+upgini-1.2.81a3832.dev12.dist-info/METADATA,sha256=2cf3_AwHclmjPzAluKb_Y2I_4OecghsB-DqKoJVODls,49173
+upgini-1.2.81a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev12.dist-info/RECORD,,