upgini 1.2.81a3832.dev10__py3-none-any.whl → 1.2.81a3832.dev11__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.81a3832.dev10"
1
+ __version__ = "1.2.81a3832.dev11"
@@ -4245,7 +4245,7 @@ if response.status_code == 200:
4245
4245
  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
4246
4246
  search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
4247
4247
  if self.fit_columns_renaming:
4248
- search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
4248
+ search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
4249
4249
  msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
4250
4250
 
4251
4251
  try:
upgini/metrics.py CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
15
15
  from category_encoders.cat_boost import CatBoostEncoder
16
16
  from lightgbm import LGBMClassifier, LGBMRegressor
17
17
  from numpy import log1p
18
- from pandas.api.types import is_numeric_dtype
18
+ from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
19
19
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
20
20
 
21
21
  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -324,6 +324,9 @@ class EstimatorWrapper:
324
324
  self.text_features = text_features
325
325
  self.logger = logger or logging.getLogger()
326
326
  self.droped_features = []
327
+ self.converted_to_int = []
328
+ self.converted_to_str = []
329
+ self.converted_to_numeric = []
327
330
 
328
331
  def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
329
332
  x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -335,44 +338,6 @@ class EstimatorWrapper:
335
338
  x, _, _ = self._prepare_to_calculate(x, None)
336
339
  return self.estimator.predict(x, **kwargs)
337
340
 
338
- def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
339
- x, y, groups = self._prepare_data(x, y, groups=self.groups)
340
-
341
- self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
342
- self.droped_features = []
343
- for c in x.columns:
344
- if _get_unique_count(x[c]) < 2:
345
- self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
346
- self.droped_features.append(c)
347
- if c in self.cat_features:
348
- self.cat_features.remove(c)
349
- x.drop(columns=[c], inplace=True)
350
- elif c in self.cat_features:
351
- if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
352
- x[c] = x[c].astype(np.int64)
353
- elif is_numeric_object(x[c]):
354
- self.logger.warning(
355
- f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
356
- )
357
- x[c] = pd.to_numeric(x[c], errors="coerce")
358
- self.cat_features.remove(c)
359
- elif x[c].dtype != "category":
360
- x[c] = x[c].astype(str)
361
- elif self.text_features is not None and c in self.text_features:
362
- x[c] = x[c].astype(str)
363
- else:
364
- if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
365
- x[c] = x[c].astype(np.int64)
366
- elif not is_valid_numeric_array_data(x[c]):
367
- try:
368
- x[c] = pd.to_numeric(x[c], errors="raise")
369
- except (ValueError, TypeError):
370
- self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
371
- self.droped_features.append(c)
372
- x.drop(columns=[c], inplace=True)
373
-
374
- return x, y, groups, {}
375
-
376
341
  def _prepare_data(
377
342
  self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
378
343
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
@@ -403,26 +368,82 @@ class EstimatorWrapper:
403
368
 
404
369
  return x, y
405
370
 
406
- def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
407
- x, y, _ = self._prepare_data(x, y)
408
-
409
- if self.droped_features:
410
- self.logger.warning(f"Dropped features: {self.droped_features}")
411
- x = x.drop(columns=self.droped_features)
371
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
372
+ x, y, groups = self._prepare_data(x, y, groups=self.groups)
412
373
 
374
+ self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
375
+ self.droped_features = []
376
+ self.converted_to_int = []
377
+ self.converted_to_str = []
378
+ self.converted_to_numeric = []
413
379
  for c in x.columns:
414
- if c in self.cat_features:
380
+ if _get_unique_count(x[c]) < 2:
381
+ self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
382
+ if c in self.cat_features:
383
+ self.cat_features.remove(c)
384
+ x.drop(columns=[c], inplace=True)
385
+ self.droped_features.append(c)
386
+ elif self.text_features is not None and c in self.text_features:
387
+ x[c] = x[c].astype(str)
388
+ self.converted_to_str.append(c)
389
+ elif c in self.cat_features:
415
390
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
416
391
  x[c] = x[c].astype(np.int64)
417
- elif x[c].dtype != "category":
392
+ self.converted_to_int.append(c)
393
+ elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
394
+ self.logger.info(
395
+ f"Convert categorical feature {c} with integer categories"
396
+ " to int64 and remove from cat_features"
397
+ )
398
+ x[c] = x[c].astype(np.int64)
399
+ self.converted_to_int.append(c)
400
+ self.cat_features.remove(c)
401
+ elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
402
+ self.logger.info(
403
+ f"Convert float cat feature {c} to string"
404
+ )
418
405
  x[c] = x[c].astype(str)
419
- elif self.text_features is not None and c in self.text_features:
420
- x[c] = x[c].astype(str)
406
+ self.converted_to_str.append(c)
407
+ elif x[c].dtype not in ["category", "int64"]:
408
+ x[c] = x[c].astype(str)
409
+ self.converted_to_str.append(c)
421
410
  else:
422
411
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
412
+ self.logger.info(f"Convert bool feature {c} to int64")
423
413
  x[c] = x[c].astype(np.int64)
414
+ self.converted_to_int.append(c)
424
415
  elif not is_valid_numeric_array_data(x[c]):
425
- x[c] = pd.to_numeric(x[c], errors="coerce")
416
+ try:
417
+ x[c] = pd.to_numeric(x[c], errors="raise")
418
+ self.converted_to_numeric.append(c)
419
+ except (ValueError, TypeError):
420
+ self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
421
+ x.drop(columns=[c], inplace=True)
422
+ self.droped_features.append(c)
423
+
424
+ return x, y, groups, {}
425
+
426
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
427
+ x, y, _ = self._prepare_data(x, y)
428
+
429
+ if self.droped_features:
430
+ self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
431
+ x = x.drop(columns=self.droped_features)
432
+
433
+ if self.converted_to_int:
434
+ self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
435
+ for c in self.converted_to_int:
436
+ x[c] = x[c].astype(np.int64)
437
+
438
+ if self.converted_to_str:
439
+ self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
440
+ for c in self.converted_to_str:
441
+ x[c] = x[c].astype(str)
442
+
443
+ if self.converted_to_numeric:
444
+ self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
445
+ for c in self.converted_to_numeric:
446
+ x[c] = pd.to_numeric(x[c], errors="coerce")
426
447
 
427
448
  return x, y, {}
428
449
 
upgini/search_task.py CHANGED
@@ -179,6 +179,7 @@ class SearchTask:
179
179
  for f in meta.generated_features
180
180
  for c in f.base_columns
181
181
  if c.ads_definition_id is None
182
+ and not c.original_name.endswith("_emb") # embeddings already added
182
183
  )
183
184
  return list(features_for_transform)
184
185
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.81a3832.dev10
3
+ Version: 1.2.81a3832.dev11
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=dUnN248oLg0rBaOttshEyx0_AtLIiP6ku5lXmtwrlQo,34
1
+ upgini/__about__.py,sha256=aMsoGp7JafbllKBjbZ_9sxh2xfd5oZMdcOt6Id_WaBU,34
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
6
+ upgini/features_enricher.py,sha256=aIG16mpdUZqV0GLMoDA4LiXMPbu3a-m72mhVqGnIww4,210860
7
7
  upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=DpXJtooXDCLTJUf3JlfIsJiwx9Hg-2vv4-k4RWkXFMU,42269
10
- upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
9
+ upgini/metrics.py,sha256=Zb-AwpstEKWaIuLqfIWLF--UGQwIoLbGYnHRlyPQ_cY,43304
10
+ upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
13
13
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.81a3832.dev10.dist-info/METADATA,sha256=F0Eg-CF-u-X2QDwUGlH0Fom-Ys1Br4bfoR_RBUq0ob8,49173
74
- upgini-1.2.81a3832.dev10.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.81a3832.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.81a3832.dev10.dist-info/RECORD,,
73
+ upgini-1.2.81a3832.dev11.dist-info/METADATA,sha256=h9Tlze7oWU3tEfYMuF9BYZTD7hlFeZM-zjrkIzMml4k,49173
74
+ upgini-1.2.81a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.81a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.81a3832.dev11.dist-info/RECORD,,