upgini-1.1.103-py3-none-any.whl → upgini-1.1.104-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/data_source/data_source_publisher.py CHANGED
@@ -44,6 +44,7 @@ class DataSourcePublisher:
44
44
  exclude_columns: Optional[List[str]] = None,
45
45
  hash_feature_names=False,
46
46
  snapshot_frequency_days: Optional[int] = None,
47
+ features_for_embeddings: Optional[List[str]] = None,
47
48
  ) -> str:
48
49
  trace_id = str(uuid.uuid4())
49
50
 
@@ -65,6 +66,7 @@ class DataSourcePublisher:
65
66
  "excludeColumns": exclude_columns,
66
67
  "hashFeatureNames": hash_feature_names,
67
68
  "snapshotFrequencyDays": snapshot_frequency_days,
69
+ "featuresForEmbeddings": features_for_embeddings,
68
70
  }
69
71
  self.logger.info(f"Start registering data table {request}")
70
72
 
upgini/features_enricher.py CHANGED
@@ -639,8 +639,8 @@ class FeaturesEnricher(TransformerMixin):
639
639
  if (
640
640
  self._search_task is None
641
641
  or self._search_task.initial_max_hit_rate_v2() is None
642
- or self.X is None
643
- or self.y is None
642
+ or (self.X is None and X is None)
643
+ or (self.y is None and y is None)
644
644
  ):
645
645
  raise ValidationError(bundle.get("metrics_unfitted_enricher"))
646
646
 
@@ -653,6 +653,18 @@ class FeaturesEnricher(TransformerMixin):
653
653
  self.__display_slack_community_link(msg)
654
654
  return None
655
655
 
656
+ if (
657
+ estimator is not None
658
+ and hasattr(estimator, "get_param")
659
+ and estimator.get_param("cat_features") is not None
660
+ ):
661
+ cat_features = estimator.get_param("cat_features")
662
+ if len(cat_features) > 0 and isinstance(cat_features[0], int):
663
+ effectiveX = X or self.X
664
+ cat_features = [effectiveX.columns[i] for i in cat_features]
665
+ else:
666
+ cat_features = None
667
+
656
668
  prepared_data = self._prepare_data_for_metrics(
657
669
  trace_id, X, y, eval_set, exclude_features_sources, importance_threshold, max_features
658
670
  )
@@ -699,7 +711,7 @@ class FeaturesEnricher(TransformerMixin):
699
711
  f"Calculate baseline {metric} on client features: {fitting_X.columns.to_list()}"
700
712
  )
701
713
  baseline_estimator = EstimatorWrapper.create(
702
- estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
714
+ estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
703
715
  )
704
716
  etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
705
717
 
@@ -711,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
711
723
  f"Calculate enriched {metric} on combined features: {fitting_enriched_X.columns.to_list()}"
712
724
  )
713
725
  enriched_estimator = EstimatorWrapper.create(
714
- estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
726
+ estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
715
727
  )
716
728
  enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
717
729
  if etalon_metric is not None:
upgini/metrics.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Callable, List, Tuple, Union
2
+ from typing import Callable, List, Optional, Tuple, Union
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
@@ -14,6 +14,7 @@ from sklearn.metrics._regression import (
14
14
  mean_squared_error,
15
15
  )
16
16
  from sklearn.model_selection import BaseCrossValidator, cross_validate
17
+ from copy import deepcopy
17
18
 
18
19
  from upgini.errors import ValidationError
19
20
  from upgini.metadata import ModelTaskType
@@ -40,7 +41,7 @@ LIGHTGBM_PARAMS = {
40
41
  "max_depth": 4,
41
42
  "n_estimators": 150,
42
43
  "learning_rate": 0.05,
43
- "min_child_weight": 1
44
+ "min_child_weight": 1,
44
45
  }
45
46
 
46
47
  N_FOLDS = 5
@@ -129,6 +130,7 @@ class EstimatorWrapper:
129
130
  cv: BaseCrossValidator,
130
131
  X: pd.DataFrame,
131
132
  scoring: Union[Callable, str, None] = None,
133
+ cat_features: Optional[List[str]] = None,
132
134
  ) -> "EstimatorWrapper":
133
135
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
134
136
  kwargs = {
@@ -149,8 +151,16 @@ class EstimatorWrapper:
149
151
  else:
150
152
  raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
151
153
  else:
152
- kwargs["estimator"] = estimator
154
+ if hasattr(estimator, "copy"):
155
+ estimator_copy = estimator.copy()
156
+ else:
157
+ estimator_copy = deepcopy(estimator)
158
+ kwargs["estimator"] = estimator_copy
153
159
  if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
160
+ if cat_features is not None:
161
+ estimator_copy.set_params(cat_features=[
162
+ X.columns.get_loc(cat_feature) for cat_feature in cat_features
163
+ ])
154
164
  estimator = CatBoostWrapper(**kwargs)
155
165
  else:
156
166
  try:
@@ -197,6 +207,15 @@ class CatBoostWrapper(EstimatorWrapper):
197
207
  else:
198
208
  X = X.drop(columns=name)
199
209
  cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
210
+ if (
211
+ hasattr(self.estimator, "get_param")
212
+ and hasattr(self.estimator, "_init_params")
213
+ and self.estimator.get_param("cat_features") is not None
214
+ ):
215
+ cat_features_set = set(cat_features_idx)
216
+ cat_features_set.update(self.estimator.get_param("cat_features"))
217
+ cat_features_idx = list(cat_features_set)
218
+ del self.estimator._init_params["cat_features"]
200
219
 
201
220
  params.update({"cat_features": cat_features_idx})
202
221
  return X, y, params
@@ -376,6 +395,6 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
376
395
  def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
377
396
  many_values_features_count = 0
378
397
  for f in _get_cat_features(X):
379
- if X[f].nunique() > 100:
398
+ if X[f].astype("string").nunique() > 100:
380
399
  many_values_features_count += 1
381
400
  return many_values_features_count >= 2
upgini/utils/email_utils.py CHANGED
@@ -9,8 +9,7 @@ from pandas.api.types import is_string_dtype
9
9
  from upgini.metadata import SearchKey
10
10
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
11
11
 
12
-
13
- EMAIL_REGEX = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
12
+ EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
14
13
 
15
14
 
16
15
  class EmailSearchKeyDetector(BaseSearchKeyDetector):
@@ -20,13 +19,11 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
20
19
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
21
20
  if not is_string_dtype(column):
22
21
  return False
22
+ if not column.astype("string").str.contains("@").any():
23
+ return False
23
24
 
24
25
  all_count = len(column)
25
- is_email_count = len(
26
- column.loc[
27
- column.astype("string").str.fullmatch(EMAIL_REGEX)
28
- ]
29
- )
26
+ is_email_count = len(column.loc[column.astype("string").str.fullmatch(EMAIL_REGEX)])
30
27
  return is_email_count / all_count > 0.1
31
28
 
32
29
 
upgini-1.1.103.dist-info/METADATA → upgini-1.1.104.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.103
3
+ Version: 1.1.104
4
4
  Summary: Low-code feature search and enrichment library for machine learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
upgini-1.1.103.dist-info/RECORD → upgini-1.1.104.dist-info/RECORD CHANGED
@@ -2,17 +2,17 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=fl01WTbM2smgskjrHQJS9oTzymEj5ZulGngCU_d5PnQ,42110
4
4
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
5
- upgini/features_enricher.py,sha256=bjPiQDGn2ULS1dEFq7vIFxF49QM8UjJWFvTewqjDdHw,109147
5
+ upgini/features_enricher.py,sha256=PtZIiOtpLItYRm3U7e5gsWAwAiTze4rznuKFFHjFpuQ,109768
6
6
  upgini/http.py,sha256=kgWj6wU1PbGPoGAbRvK35umXQ5zwEfEKeGy5Az0fss0,35479
7
7
  upgini/metadata.py,sha256=GPGsaGi5UtePQR2Qiqc7OJZn-ewvHmvepn3P_wJDW7Y,5856
8
- upgini/metrics.py,sha256=3gKDUJe4IzcS32hLitbPj2G-y-F66eyzKw0DWSIkun0,13937
8
+ upgini/metrics.py,sha256=uJhtGKgUUFnvdF16xscfe9AGDoDN6LqUV97RWDP39NU,14869
9
9
  upgini/search_task.py,sha256=H7l-BhCRF9t58D0L1xNdC_qU_JFHYnAZZ165fVDQgmM,33884
10
10
  upgini/spinner.py,sha256=X9a0xhj0QVIwjVTTjXUTuAgPBnyrLbW-B6G534fxs1E,1149
11
11
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
12
12
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
13
13
  upgini/ads_management/ads_manager.py,sha256=Cc3v4lLLpM0g4oUH_q2DYFN3bNWpSmltAGnZQby3G74,2630
14
14
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- upgini/data_source/data_source_publisher.py,sha256=siGi3jWuP1wu4QS5g5XwNYGhCF0ILOw14qRXCfxD2jo,8415
15
+ upgini/data_source/data_source_publisher.py,sha256=-Tpqiw6xrCinxdDKIEg6aS68ZqLwxoBrg4J4PTQNs6g,8546
16
16
  upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
17
17
  upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
18
18
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,7 +31,7 @@ upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o
31
31
  upgini/utils/cv_utils.py,sha256=PeexQVPWrpUNlmwGtfU1FWA-aI1UyrMDgMT594ErpxA,2252
32
32
  upgini/utils/datetime_utils.py,sha256=PK1Fc5rJ_UhCJc1TNOZPSrtsYxjD7v9dsBYOZj1RKvo,4292
33
33
  upgini/utils/display_utils.py,sha256=iG3-hdv8_rJDWKwnQYIi1SHF-gLPAEi8jjk_05-qtMg,1934
34
- upgini/utils/email_utils.py,sha256=H05wMKVZML36Ipwxv1C4StDUyPJVaXWgU5NA06UDJMo,3048
34
+ upgini/utils/email_utils.py,sha256=2IUxP1e8DsmU4qS1BN3n1JmuziZO_cV35fNf4Di0yxc,3090
35
35
  upgini/utils/features_validator.py,sha256=LZAKTWtmINWII09UHF0R0muEz7yHLGlJkLUk8zM305Q,2190
36
36
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
37
37
  upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
@@ -39,8 +39,8 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
39
39
  upgini/utils/target_utils.py,sha256=3eHrDy_Dc9ozuOwHGnGA705m9glCxKmjB-DfLrflqiA,1370
40
40
  upgini/utils/track_info.py,sha256=O_oL4gy1jH0DVgtiUeZAW0YKCeRT4B_bzH_SZYkFaOE,4076
41
41
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
42
- upgini-1.1.103.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
43
- upgini-1.1.103.dist-info/METADATA,sha256=8zAaEAC2-WpBOPqrRagZYdO1mABJzDTWhVgs7R86E1Q,41101
44
- upgini-1.1.103.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
45
- upgini-1.1.103.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
46
- upgini-1.1.103.dist-info/RECORD,,
42
+ upgini-1.1.104.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
43
+ upgini-1.1.104.dist-info/METADATA,sha256=nvFzylBFXaBafhSec5_Ja5KfVcrNc2pzAHNRVzBMzhA,41101
44
+ upgini-1.1.104.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
45
+ upgini-1.1.104.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
46
+ upgini-1.1.104.dist-info/RECORD,,