upgini 1.2.9__tar.gz → 1.2.9a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.2.9 → upgini-1.2.9a2}/PKG-INFO +1 -2
  2. {upgini-1.2.9 → upgini-1.2.9a2}/pyproject.toml +1 -1
  3. upgini-1.2.9a2/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/__init__.py +2 -2
  5. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/features_enricher.py +22 -13
  6. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/metrics.py +1 -5
  7. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/features_validator.py +1 -1
  8. upgini-1.2.9/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.9 → upgini-1.2.9a2}/.gitignore +0 -0
  10. {upgini-1.2.9 → upgini-1.2.9a2}/LICENSE +0 -0
  11. {upgini-1.2.9 → upgini-1.2.9a2}/README.md +0 -0
  12. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/http.py +0 -0
  29. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.2.9 → upgini-1.2.9a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.9
3
+ Version: 1.2.9a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -28,7 +28,6 @@ Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
29
  Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
- Requires-Dist: lightgbm>=3.3.2
32
31
  Requires-Dist: numpy<=1.26.4,>=1.19.0
33
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
34
33
  Requires-Dist: pydantic<3.0.0,>1.0.0
@@ -38,7 +38,7 @@ dependencies = [
38
38
  "catboost>=1.0.3",
39
39
  "fastparquet>=0.8.1",
40
40
  "ipywidgets>=8.1.0",
41
- "lightgbm>=3.3.2",
41
+
42
42
  "numpy>=1.19.0,<=1.26.4",
43
43
  "pandas>=1.1.0,<3.0.0",
44
44
  "pydantic>1.0.0,<3.0.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.9a2"
@@ -1,7 +1,7 @@
1
1
  import os
2
2
 
3
- from upgini.features_enricher import FeaturesEnricher # noqa: F401
4
- from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
3
+ # from upgini.features_enricher import FeaturesEnricher # noqa: F401
4
+ # from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
5
5
  # from .lazy_import import LazyImport
6
6
 
7
7
  os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
@@ -1103,7 +1103,7 @@ class FeaturesEnricher(TransformerMixin):
1103
1103
  else:
1104
1104
  eval_uplift = None
1105
1105
 
1106
- # effective_eval_set = eval_set if eval_set is not None else self.eval_set
1106
+ effective_eval_set = eval_set if eval_set is not None else self.eval_set
1107
1107
  eval_metrics = {
1108
1108
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
1109
1109
  "quality_metrics_eval_segment"
@@ -1369,7 +1369,6 @@ class FeaturesEnricher(TransformerMixin):
1369
1369
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1370
1370
  )
1371
1371
  ]
1372
- self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
1373
1372
 
1374
1373
  filtered_enriched_features = self.__filtered_enriched_features(
1375
1374
  importance_threshold,
@@ -1436,19 +1435,31 @@ class FeaturesEnricher(TransformerMixin):
1436
1435
  )
1437
1436
 
1438
1437
  fitting_eval_set_dict = {}
1439
- fitting_x_columns = fitting_X.columns.to_list()
1440
- self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
1441
- fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
1442
- self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
1443
1438
  for idx, eval_tuple in eval_set_sampled_dict.items():
1444
1439
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1445
1440
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
1446
1441
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
1447
1442
  enriched_eval_X, eval_y_sampled, self.cv
1448
1443
  )
1449
- fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
1450
- fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
1451
-
1444
+ fitting_eval_X = eval_X_sorted[client_features].copy()
1445
+ fitting_enriched_eval_X = enriched_eval_X_sorted[
1446
+ client_features + existing_filtered_enriched_features
1447
+ ].copy()
1448
+
1449
+ # # Drop high cardinality features in eval set
1450
+ if len(columns_with_high_cardinality) > 0:
1451
+ fitting_eval_X = fitting_eval_X.drop(columns=columns_with_high_cardinality, errors="ignore")
1452
+ fitting_enriched_eval_X = fitting_enriched_eval_X.drop(
1453
+ columns=columns_with_high_cardinality, errors="ignore"
1454
+ )
1455
+ # Drop constant features in eval_set
1456
+ if len(constant_columns) > 0:
1457
+ fitting_eval_X = fitting_eval_X.drop(columns=constant_columns, errors="ignore")
1458
+ fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=constant_columns, errors="ignore")
1459
+ # Drop datetime features in eval_set
1460
+ if len(datetime_features) > 0:
1461
+ fitting_eval_X = fitting_eval_X.drop(columns=datetime_features, errors="ignore")
1462
+ fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=datetime_features, errors="ignore")
1452
1463
  # Convert bool to string in eval_set
1453
1464
  if len(bool_columns) > 0:
1454
1465
  fitting_eval_X[col] = fitting_eval_X[col].astype(str)
@@ -1669,7 +1680,6 @@ class FeaturesEnricher(TransformerMixin):
1669
1680
  X_sampled = enriched_Xy[x_columns].copy()
1670
1681
  y_sampled = enriched_Xy[TARGET].copy()
1671
1682
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1672
- enriched_X_columns = enriched_X.columns.to_list()
1673
1683
 
1674
1684
  self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
1675
1685
  self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
@@ -1684,7 +1694,7 @@ class FeaturesEnricher(TransformerMixin):
1684
1694
  for idx in range(len(eval_set)):
1685
1695
  eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
1686
1696
  eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
1687
- enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
1697
+ enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1688
1698
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1689
1699
 
1690
1700
  self.__cached_sampled_datasets = (
@@ -1763,13 +1773,12 @@ class FeaturesEnricher(TransformerMixin):
1763
1773
  X_sampled = enriched_Xy[x_columns].copy()
1764
1774
  y_sampled = enriched_Xy[TARGET].copy()
1765
1775
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
1766
- enriched_X_columns = enriched_X.columns.tolist()
1767
1776
 
1768
1777
  for idx in range(len(eval_set)):
1769
1778
  enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1770
1779
  eval_x_sampled = enriched_eval_xy[x_columns].copy()
1771
1780
  eval_y_sampled = enriched_eval_xy[TARGET].copy()
1772
- enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
1781
+ enriched_eval_x = enriched_eval_xy.drop(columns=[TARGET, EVAL_SET_INDEX])
1773
1782
  eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
1774
1783
  else:
1775
1784
  self.logger.info("Transform without eval_set")
@@ -10,7 +10,6 @@ import catboost
10
10
  import numpy as np
11
11
  import pandas as pd
12
12
  from catboost import CatBoostClassifier, CatBoostRegressor
13
- from lightgbm import LGBMClassifier, LGBMRegressor
14
13
  from numpy import log1p
15
14
  from pandas.api.types import is_numeric_dtype
16
15
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -254,7 +253,6 @@ class EstimatorWrapper:
254
253
  def _prepare_data(
255
254
  self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
256
255
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
257
- self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
258
256
  for c in x.columns:
259
257
  if is_numeric_dtype(x[c]):
260
258
  x[c] = x[c].astype(float)
@@ -273,7 +271,6 @@ class EstimatorWrapper:
273
271
  else:
274
272
  x, y = self._remove_empty_target_rows(x, y)
275
273
 
276
- self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
277
274
  return x, y, groups
278
275
 
279
276
  def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
@@ -408,6 +405,7 @@ class EstimatorWrapper:
408
405
  estimator = CatBoostWrapper(**kwargs)
409
406
  else:
410
407
  try:
408
+ from lightgbm import LGBMClassifier, LGBMRegressor
411
409
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
412
410
  estimator = LightGBMWrapper(**kwargs)
413
411
  else:
@@ -495,9 +493,7 @@ class CatBoostWrapper(EstimatorWrapper):
495
493
  if x[name].nunique() > 1:
496
494
  unique_cat_features.append(name)
497
495
  else:
498
- self.logger.info(f"Drop column {name} on preparing data for fit")
499
496
  x = x.drop(columns=name)
500
- self.exclude_features.append(name)
501
497
  self.cat_features = unique_cat_features
502
498
  if (
503
499
  hasattr(self.estimator, "get_param")
@@ -87,4 +87,4 @@ class FeaturesValidator:
87
87
 
88
88
  @staticmethod
89
89
  def find_constant_features(df: pd.DataFrame) -> List[str]:
90
- return [i for i in df if df[i].nunique() <= 1]
90
+ return [i for i in df if df[i].nunique() == 1]
@@ -1 +0,0 @@
1
- __version__ = "1.2.9"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes