upgini 1.1.102a1__tar.gz → 1.1.104__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (52)
  1. {upgini-1.1.102a1/src/upgini.egg-info → upgini-1.1.104}/PKG-INFO +2 -2
  2. {upgini-1.1.102a1 → upgini-1.1.104}/README.md +1 -1
  3. {upgini-1.1.102a1 → upgini-1.1.104}/setup.py +1 -1
  4. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/data_source/data_source_publisher.py +2 -0
  5. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/features_enricher.py +17 -5
  6. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/http.py +1 -3
  7. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/metrics.py +27 -3
  8. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/datetime_utils.py +3 -0
  9. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/email_utils.py +4 -7
  10. {upgini-1.1.102a1 → upgini-1.1.104/src/upgini.egg-info}/PKG-INFO +2 -2
  11. {upgini-1.1.102a1 → upgini-1.1.104}/LICENSE +0 -0
  12. {upgini-1.1.102a1 → upgini-1.1.104}/pyproject.toml +0 -0
  13. {upgini-1.1.102a1 → upgini-1.1.104}/setup.cfg +0 -0
  14. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/__init__.py +0 -0
  15. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/ads.py +0 -0
  16. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/ads_management/__init__.py +0 -0
  17. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/ads_management/ads_manager.py +0 -0
  18. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/data_source/__init__.py +0 -0
  19. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/dataset.py +0 -0
  20. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/errors.py +0 -0
  21. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/mdc/__init__.py +0 -0
  22. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/mdc/context.py +0 -0
  23. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/metadata.py +0 -0
  24. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/normalizer/__init__.py +0 -0
  25. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/normalizer/phone_normalizer.py +0 -0
  26. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/resource_bundle/__init__.py +0 -0
  27. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/resource_bundle/exceptions.py +0 -0
  28. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/resource_bundle/strings.properties +0 -0
  29. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/sampler/__init__.py +0 -0
  30. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/sampler/base.py +0 -0
  31. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/sampler/random_under_sampler.py +0 -0
  32. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/sampler/utils.py +0 -0
  33. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/search_task.py +0 -0
  34. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/spinner.py +0 -0
  35. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/__init__.py +0 -0
  36. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/base_search_key_detector.py +0 -0
  37. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/blocked_time_series.py +0 -0
  38. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/country_utils.py +0 -0
  39. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/cv_utils.py +0 -0
  40. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/display_utils.py +0 -0
  41. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/features_validator.py +0 -0
  42. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/format.py +0 -0
  43. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/phone_utils.py +0 -0
  44. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/postal_code_utils.py +0 -0
  45. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/target_utils.py +0 -0
  46. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/track_info.py +0 -0
  47. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/utils/warning_counter.py +0 -0
  48. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini/version_validator.py +0 -0
  49. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini.egg-info/SOURCES.txt +0 -0
  50. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini.egg-info/dependency_links.txt +0 -0
  51. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini.egg-info/requires.txt +0 -0
  52. {upgini-1.1.102a1 → upgini-1.1.104}/src/upgini.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.102a1
3
+ Version: 1.1.104
4
4
  Summary: Low-code feature search and enrichment library for machine learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -25,7 +25,7 @@ Description:
25
25
  </p>
26
26
  <p align=center>
27
27
  <a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
28
- <a href="https://www.python.org/downloads/release/python-380/"><img alt="Python 3.8" src="https://img.shields.io/badge/python_version-3.8-red?logo=python&logoColor=white"></a>
28
+ <a href="https://pypi.org/project/upgini/"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/upgini"></a>
29
29
  <a href="https://pypi.org/project/upgini/"><img alt="PyPI" src="https://img.shields.io/pypi/v/upgini?label=Release"></a>
30
30
  <a href="https://pypistats.org/packages/upgini"><img alt="Downloads from pypistats" src="https://pepy.tech/badge/upgini"></a>
31
31
  <a href="https://4mlg.short.gy/join-upgini-community"><img alt="Upgini slack community" src="https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack"></a>
@@ -15,7 +15,7 @@ enriches your ML pipeline with only the relevant features</b> </p>
15
15
  </p>
16
16
  <p align=center>
17
17
  <a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
18
- <a href="https://www.python.org/downloads/release/python-380/"><img alt="Python 3.8" src="https://img.shields.io/badge/python_version-3.8-red?logo=python&logoColor=white"></a>
18
+ <a href="https://pypi.org/project/upgini/"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/upgini"></a>
19
19
  <a href="https://pypi.org/project/upgini/"><img alt="PyPI" src="https://img.shields.io/pypi/v/upgini?label=Release"></a>
20
20
  <a href="https://pypistats.org/packages/upgini"><img alt="Downloads from pypistats" src="https://pepy.tech/badge/upgini"></a>
21
21
  <a href="https://4mlg.short.gy/join-upgini-community"><img alt="Upgini slack community" src="https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack"></a>
@@ -35,7 +35,7 @@ def send_log(msg: str):
35
35
 
36
36
 
37
37
  here = Path(__file__).parent.resolve()
38
- version = "1.1.102a1"
38
+ version = "1.1.104"
39
39
  try:
40
40
  send_log(f"Start setup PyLib version {version}")
41
41
  setup(
@@ -44,6 +44,7 @@ class DataSourcePublisher:
44
44
  exclude_columns: Optional[List[str]] = None,
45
45
  hash_feature_names=False,
46
46
  snapshot_frequency_days: Optional[int] = None,
47
+ features_for_embeddings: Optional[List[str]] = None,
47
48
  ) -> str:
48
49
  trace_id = str(uuid.uuid4())
49
50
 
@@ -65,6 +66,7 @@ class DataSourcePublisher:
65
66
  "excludeColumns": exclude_columns,
66
67
  "hashFeatureNames": hash_feature_names,
67
68
  "snapshotFrequencyDays": snapshot_frequency_days,
69
+ "featuresForEmbeddings": features_for_embeddings,
68
70
  }
69
71
  self.logger.info(f"Start registering data table {request}")
70
72
 
@@ -639,8 +639,8 @@ class FeaturesEnricher(TransformerMixin):
639
639
  if (
640
640
  self._search_task is None
641
641
  or self._search_task.initial_max_hit_rate_v2() is None
642
- or self.X is None
643
- or self.y is None
642
+ or (self.X is None and X is None)
643
+ or (self.y is None and y is None)
644
644
  ):
645
645
  raise ValidationError(bundle.get("metrics_unfitted_enricher"))
646
646
 
@@ -653,6 +653,18 @@ class FeaturesEnricher(TransformerMixin):
653
653
  self.__display_slack_community_link(msg)
654
654
  return None
655
655
 
656
+ if (
657
+ estimator is not None
658
+ and hasattr(estimator, "get_param")
659
+ and estimator.get_param("cat_features") is not None
660
+ ):
661
+ cat_features = estimator.get_param("cat_features")
662
+ if len(cat_features) > 0 and isinstance(cat_features[0], int):
663
+ effectiveX = X or self.X
664
+ cat_features = [effectiveX.columns[i] for i in cat_features]
665
+ else:
666
+ cat_features = None
667
+
656
668
  prepared_data = self._prepare_data_for_metrics(
657
669
  trace_id, X, y, eval_set, exclude_features_sources, importance_threshold, max_features
658
670
  )
@@ -699,7 +711,7 @@ class FeaturesEnricher(TransformerMixin):
699
711
  f"Calculate baseline {metric} on client features: {fitting_X.columns.to_list()}"
700
712
  )
701
713
  baseline_estimator = EstimatorWrapper.create(
702
- estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
714
+ estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
703
715
  )
704
716
  etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
705
717
 
@@ -711,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
711
723
  f"Calculate enriched {metric} on combined features: {fitting_enriched_X.columns.to_list()}"
712
724
  )
713
725
  enriched_estimator = EstimatorWrapper.create(
714
- estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
726
+ estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
715
727
  )
716
728
  enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
717
729
  if etalon_metric is not None:
@@ -2224,7 +2236,7 @@ class FeaturesEnricher(TransformerMixin):
2224
2236
 
2225
2237
  def _validate_binary_observations(self, y):
2226
2238
  task_type = self.model_task_type or define_task(y, self.logger, silent=True)
2227
- if task_type == ModelTaskType.BINARY and _num_samples(y) < 1000:
2239
+ if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
2228
2240
  msg = bundle.get("binary_small_dataset")
2229
2241
  self.logger.warning(msg)
2230
2242
  print(msg)
@@ -790,7 +790,6 @@ class BackendLogHandler(logging.Handler):
790
790
  text = self.format(record)
791
791
  tags = get_track_metrics()
792
792
  tags["version"] = __version__
793
- print(f"Sending log to server: {text}")
794
793
  self.rest_client.send_log_event(
795
794
  LogEvent(
796
795
  source="python",
@@ -800,8 +799,7 @@ class BackendLogHandler(logging.Handler):
800
799
  service="PyLib",
801
800
  )
802
801
  )
803
- except Exception as e:
804
- print(f"Failed to send log: {e}")
802
+ except Exception:
805
803
  pass
806
804
 
807
805
  thread = threading.Thread(target=task)
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Callable, List, Tuple, Union
2
+ from typing import Callable, List, Optional, Tuple, Union
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
@@ -14,6 +14,7 @@ from sklearn.metrics._regression import (
14
14
  mean_squared_error,
15
15
  )
16
16
  from sklearn.model_selection import BaseCrossValidator, cross_validate
17
+ from copy import deepcopy
17
18
 
18
19
  from upgini.errors import ValidationError
19
20
  from upgini.metadata import ModelTaskType
@@ -36,6 +37,11 @@ CATBOOST_PARAMS = {
36
37
 
37
38
  LIGHTGBM_PARAMS = {
38
39
  "random_state": DEFAULT_RANDOM_STATE,
40
+ "num_leaves": 16,
41
+ "max_depth": 4,
42
+ "n_estimators": 150,
43
+ "learning_rate": 0.05,
44
+ "min_child_weight": 1,
39
45
  }
40
46
 
41
47
  N_FOLDS = 5
@@ -124,6 +130,7 @@ class EstimatorWrapper:
124
130
  cv: BaseCrossValidator,
125
131
  X: pd.DataFrame,
126
132
  scoring: Union[Callable, str, None] = None,
133
+ cat_features: Optional[List[str]] = None,
127
134
  ) -> "EstimatorWrapper":
128
135
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
129
136
  kwargs = {
@@ -144,8 +151,16 @@ class EstimatorWrapper:
144
151
  else:
145
152
  raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
146
153
  else:
147
- kwargs["estimator"] = estimator
154
+ if hasattr(estimator, "copy"):
155
+ estimator_copy = estimator.copy()
156
+ else:
157
+ estimator_copy = deepcopy(estimator)
158
+ kwargs["estimator"] = estimator_copy
148
159
  if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
160
+ if cat_features is not None:
161
+ estimator_copy.set_params(cat_features=[
162
+ X.columns.get_loc(cat_feature) for cat_feature in cat_features
163
+ ])
149
164
  estimator = CatBoostWrapper(**kwargs)
150
165
  else:
151
166
  try:
@@ -192,6 +207,15 @@ class CatBoostWrapper(EstimatorWrapper):
192
207
  else:
193
208
  X = X.drop(columns=name)
194
209
  cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
210
+ if (
211
+ hasattr(self.estimator, "get_param")
212
+ and hasattr(self.estimator, "_init_params")
213
+ and self.estimator.get_param("cat_features") is not None
214
+ ):
215
+ cat_features_set = set(cat_features_idx)
216
+ cat_features_set.update(self.estimator.get_param("cat_features"))
217
+ cat_features_idx = list(cat_features_set)
218
+ del self.estimator._init_params["cat_features"]
195
219
 
196
220
  params.update({"cat_features": cat_features_idx})
197
221
  return X, y, params
@@ -371,6 +395,6 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
371
395
  def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
372
396
  many_values_features_count = 0
373
397
  for f in _get_cat_features(X):
374
- if X[f].nunique() > 100:
398
+ if X[f].astype("string").nunique() > 100:
375
399
  many_values_features_count += 1
376
400
  return many_values_features_count >= 2
@@ -5,6 +5,7 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
7
7
  from dateutil.relativedelta import relativedelta
8
+ import datetime
8
9
 
9
10
  from upgini.errors import ValidationError
10
11
 
@@ -29,6 +30,8 @@ class DateTimeSearchKeyConverter:
29
30
 
30
31
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
31
32
  df = df.copy()
33
+ if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
34
+ df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
32
35
  if is_string_dtype(df[self.date_column]):
33
36
  try:
34
37
  df[self.date_column] = pd.to_datetime(df[self.date_column], format=self.date_format)
@@ -9,8 +9,7 @@ from pandas.api.types import is_string_dtype
9
9
  from upgini.metadata import SearchKey
10
10
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
11
11
 
12
-
13
- EMAIL_REGEX = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
12
+ EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
14
13
 
15
14
 
16
15
  class EmailSearchKeyDetector(BaseSearchKeyDetector):
@@ -20,13 +19,11 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
20
19
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
21
20
  if not is_string_dtype(column):
22
21
  return False
22
+ if not column.astype("string").str.contains("@").any():
23
+ return False
23
24
 
24
25
  all_count = len(column)
25
- is_email_count = len(
26
- column.loc[
27
- column.astype("string").str.fullmatch(EMAIL_REGEX)
28
- ]
29
- )
26
+ is_email_count = len(column.loc[column.astype("string").str.fullmatch(EMAIL_REGEX)])
30
27
  return is_email_count / all_count > 0.1
31
28
 
32
29
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.102a1
3
+ Version: 1.1.104
4
4
  Summary: Low-code feature search and enrichment library for machine learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -25,7 +25,7 @@ Description:
25
25
  </p>
26
26
  <p align=center>
27
27
  <a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
28
- <a href="https://www.python.org/downloads/release/python-380/"><img alt="Python 3.8" src="https://img.shields.io/badge/python_version-3.8-red?logo=python&logoColor=white"></a>
28
+ <a href="https://pypi.org/project/upgini/"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/upgini"></a>
29
29
  <a href="https://pypi.org/project/upgini/"><img alt="PyPI" src="https://img.shields.io/pypi/v/upgini?label=Release"></a>
30
30
  <a href="https://pypistats.org/packages/upgini"><img alt="Downloads from pypistats" src="https://pepy.tech/badge/upgini"></a>
31
31
  <a href="https://4mlg.short.gy/join-upgini-community"><img alt="Upgini slack community" src="https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack"></a>
File without changes
File without changes
File without changes
File without changes