upgini 1.1.103__py3-none-any.whl → 1.1.104__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/data_source/data_source_publisher.py +2 -0
- upgini/features_enricher.py +16 -4
- upgini/metrics.py +23 -4
- upgini/utils/email_utils.py +4 -7
- {upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/METADATA +1 -1
- {upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/RECORD +9 -9
- {upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/LICENSE +0 -0
- {upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/WHEEL +0 -0
- {upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/top_level.txt +0 -0
upgini/data_source/data_source_publisher.py
CHANGED
|
@@ -44,6 +44,7 @@ class DataSourcePublisher:
|
|
|
44
44
|
exclude_columns: Optional[List[str]] = None,
|
|
45
45
|
hash_feature_names=False,
|
|
46
46
|
snapshot_frequency_days: Optional[int] = None,
|
|
47
|
+
features_for_embeddings: Optional[List[str]] = None,
|
|
47
48
|
) -> str:
|
|
48
49
|
trace_id = str(uuid.uuid4())
|
|
49
50
|
|
|
@@ -65,6 +66,7 @@ class DataSourcePublisher:
|
|
|
65
66
|
"excludeColumns": exclude_columns,
|
|
66
67
|
"hashFeatureNames": hash_feature_names,
|
|
67
68
|
"snapshotFrequencyDays": snapshot_frequency_days,
|
|
69
|
+
"featuresForEmbeddings": features_for_embeddings,
|
|
68
70
|
}
|
|
69
71
|
self.logger.info(f"Start registering data table {request}")
|
|
70
72
|
|
upgini/features_enricher.py
CHANGED
|
@@ -639,8 +639,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
639
639
|
if (
|
|
640
640
|
self._search_task is None
|
|
641
641
|
or self._search_task.initial_max_hit_rate_v2() is None
|
|
642
|
-
or self.X is None
|
|
643
|
-
or self.y is None
|
|
642
|
+
or (self.X is None and X is None)
|
|
643
|
+
or (self.y is None and y is None)
|
|
644
644
|
):
|
|
645
645
|
raise ValidationError(bundle.get("metrics_unfitted_enricher"))
|
|
646
646
|
|
|
@@ -653,6 +653,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
653
653
|
self.__display_slack_community_link(msg)
|
|
654
654
|
return None
|
|
655
655
|
|
|
656
|
+
if (
|
|
657
|
+
estimator is not None
|
|
658
|
+
and hasattr(estimator, "get_param")
|
|
659
|
+
and estimator.get_param("cat_features") is not None
|
|
660
|
+
):
|
|
661
|
+
cat_features = estimator.get_param("cat_features")
|
|
662
|
+
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
663
|
+
effectiveX = X or self.X
|
|
664
|
+
cat_features = [effectiveX.columns[i] for i in cat_features]
|
|
665
|
+
else:
|
|
666
|
+
cat_features = None
|
|
667
|
+
|
|
656
668
|
prepared_data = self._prepare_data_for_metrics(
|
|
657
669
|
trace_id, X, y, eval_set, exclude_features_sources, importance_threshold, max_features
|
|
658
670
|
)
|
|
@@ -699,7 +711,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
699
711
|
f"Calculate baseline {metric} on client features: {fitting_X.columns.to_list()}"
|
|
700
712
|
)
|
|
701
713
|
baseline_estimator = EstimatorWrapper.create(
|
|
702
|
-
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
|
|
714
|
+
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
|
|
703
715
|
)
|
|
704
716
|
etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
|
|
705
717
|
|
|
@@ -711,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
711
723
|
f"Calculate enriched {metric} on combined features: {fitting_enriched_X.columns.to_list()}"
|
|
712
724
|
)
|
|
713
725
|
enriched_estimator = EstimatorWrapper.create(
|
|
714
|
-
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
|
|
726
|
+
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
|
|
715
727
|
)
|
|
716
728
|
enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
717
729
|
if etalon_metric is not None:
|
upgini/metrics.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Callable, List, Tuple, Union
|
|
2
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
@@ -14,6 +14,7 @@ from sklearn.metrics._regression import (
|
|
|
14
14
|
mean_squared_error,
|
|
15
15
|
)
|
|
16
16
|
from sklearn.model_selection import BaseCrossValidator, cross_validate
|
|
17
|
+
from copy import deepcopy
|
|
17
18
|
|
|
18
19
|
from upgini.errors import ValidationError
|
|
19
20
|
from upgini.metadata import ModelTaskType
|
|
@@ -40,7 +41,7 @@ LIGHTGBM_PARAMS = {
|
|
|
40
41
|
"max_depth": 4,
|
|
41
42
|
"n_estimators": 150,
|
|
42
43
|
"learning_rate": 0.05,
|
|
43
|
-
"min_child_weight": 1
|
|
44
|
+
"min_child_weight": 1,
|
|
44
45
|
}
|
|
45
46
|
|
|
46
47
|
N_FOLDS = 5
|
|
@@ -129,6 +130,7 @@ class EstimatorWrapper:
|
|
|
129
130
|
cv: BaseCrossValidator,
|
|
130
131
|
X: pd.DataFrame,
|
|
131
132
|
scoring: Union[Callable, str, None] = None,
|
|
133
|
+
cat_features: Optional[List[str]] = None,
|
|
132
134
|
) -> "EstimatorWrapper":
|
|
133
135
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
134
136
|
kwargs = {
|
|
@@ -149,8 +151,16 @@ class EstimatorWrapper:
|
|
|
149
151
|
else:
|
|
150
152
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
|
|
151
153
|
else:
|
|
152
|
-
|
|
154
|
+
if hasattr(estimator, "copy"):
|
|
155
|
+
estimator_copy = estimator.copy()
|
|
156
|
+
else:
|
|
157
|
+
estimator_copy = deepcopy(estimator)
|
|
158
|
+
kwargs["estimator"] = estimator_copy
|
|
153
159
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
160
|
+
if cat_features is not None:
|
|
161
|
+
estimator_copy.set_params(cat_features=[
|
|
162
|
+
X.columns.get_loc(cat_feature) for cat_feature in cat_features
|
|
163
|
+
])
|
|
154
164
|
estimator = CatBoostWrapper(**kwargs)
|
|
155
165
|
else:
|
|
156
166
|
try:
|
|
@@ -197,6 +207,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
197
207
|
else:
|
|
198
208
|
X = X.drop(columns=name)
|
|
199
209
|
cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
|
|
210
|
+
if (
|
|
211
|
+
hasattr(self.estimator, "get_param")
|
|
212
|
+
and hasattr(self.estimator, "_init_params")
|
|
213
|
+
and self.estimator.get_param("cat_features") is not None
|
|
214
|
+
):
|
|
215
|
+
cat_features_set = set(cat_features_idx)
|
|
216
|
+
cat_features_set.update(self.estimator.get_param("cat_features"))
|
|
217
|
+
cat_features_idx = list(cat_features_set)
|
|
218
|
+
del self.estimator._init_params["cat_features"]
|
|
200
219
|
|
|
201
220
|
params.update({"cat_features": cat_features_idx})
|
|
202
221
|
return X, y, params
|
|
@@ -376,6 +395,6 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
|
376
395
|
def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
|
|
377
396
|
many_values_features_count = 0
|
|
378
397
|
for f in _get_cat_features(X):
|
|
379
|
-
if X[f].nunique() > 100:
|
|
398
|
+
if X[f].astype("string").nunique() > 100:
|
|
380
399
|
many_values_features_count += 1
|
|
381
400
|
return many_values_features_count >= 2
|
upgini/utils/email_utils.py
CHANGED
|
@@ -9,8 +9,7 @@ from pandas.api.types import is_string_dtype
|
|
|
9
9
|
from upgini.metadata import SearchKey
|
|
10
10
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
EMAIL_REGEX = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
|
|
12
|
+
EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
@@ -20,13 +19,11 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
20
19
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
21
20
|
if not is_string_dtype(column):
|
|
22
21
|
return False
|
|
22
|
+
if not column.astype("string").str.contains("@").any():
|
|
23
|
+
return False
|
|
23
24
|
|
|
24
25
|
all_count = len(column)
|
|
25
|
-
is_email_count = len(
|
|
26
|
-
column.loc[
|
|
27
|
-
column.astype("string").str.fullmatch(EMAIL_REGEX)
|
|
28
|
-
]
|
|
29
|
-
)
|
|
26
|
+
is_email_count = len(column.loc[column.astype("string").str.fullmatch(EMAIL_REGEX)])
|
|
30
27
|
return is_email_count / all_count > 0.1
|
|
31
28
|
|
|
32
29
|
|
{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/RECORD
CHANGED
|
@@ -2,17 +2,17 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=fl01WTbM2smgskjrHQJS9oTzymEj5ZulGngCU_d5PnQ,42110
|
|
4
4
|
upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=PtZIiOtpLItYRm3U7e5gsWAwAiTze4rznuKFFHjFpuQ,109768
|
|
6
6
|
upgini/http.py,sha256=kgWj6wU1PbGPoGAbRvK35umXQ5zwEfEKeGy5Az0fss0,35479
|
|
7
7
|
upgini/metadata.py,sha256=GPGsaGi5UtePQR2Qiqc7OJZn-ewvHmvepn3P_wJDW7Y,5856
|
|
8
|
-
upgini/metrics.py,sha256=
|
|
8
|
+
upgini/metrics.py,sha256=uJhtGKgUUFnvdF16xscfe9AGDoDN6LqUV97RWDP39NU,14869
|
|
9
9
|
upgini/search_task.py,sha256=H7l-BhCRF9t58D0L1xNdC_qU_JFHYnAZZ165fVDQgmM,33884
|
|
10
10
|
upgini/spinner.py,sha256=X9a0xhj0QVIwjVTTjXUTuAgPBnyrLbW-B6G534fxs1E,1149
|
|
11
11
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
12
12
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
13
13
|
upgini/ads_management/ads_manager.py,sha256=Cc3v4lLLpM0g4oUH_q2DYFN3bNWpSmltAGnZQby3G74,2630
|
|
14
14
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
upgini/data_source/data_source_publisher.py,sha256
|
|
15
|
+
upgini/data_source/data_source_publisher.py,sha256=-Tpqiw6xrCinxdDKIEg6aS68ZqLwxoBrg4J4PTQNs6g,8546
|
|
16
16
|
upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
|
|
17
17
|
upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
|
|
18
18
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -31,7 +31,7 @@ upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o
|
|
|
31
31
|
upgini/utils/cv_utils.py,sha256=PeexQVPWrpUNlmwGtfU1FWA-aI1UyrMDgMT594ErpxA,2252
|
|
32
32
|
upgini/utils/datetime_utils.py,sha256=PK1Fc5rJ_UhCJc1TNOZPSrtsYxjD7v9dsBYOZj1RKvo,4292
|
|
33
33
|
upgini/utils/display_utils.py,sha256=iG3-hdv8_rJDWKwnQYIi1SHF-gLPAEi8jjk_05-qtMg,1934
|
|
34
|
-
upgini/utils/email_utils.py,sha256=
|
|
34
|
+
upgini/utils/email_utils.py,sha256=2IUxP1e8DsmU4qS1BN3n1JmuziZO_cV35fNf4Di0yxc,3090
|
|
35
35
|
upgini/utils/features_validator.py,sha256=LZAKTWtmINWII09UHF0R0muEz7yHLGlJkLUk8zM305Q,2190
|
|
36
36
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
37
37
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
@@ -39,8 +39,8 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
|
|
|
39
39
|
upgini/utils/target_utils.py,sha256=3eHrDy_Dc9ozuOwHGnGA705m9glCxKmjB-DfLrflqiA,1370
|
|
40
40
|
upgini/utils/track_info.py,sha256=O_oL4gy1jH0DVgtiUeZAW0YKCeRT4B_bzH_SZYkFaOE,4076
|
|
41
41
|
upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
|
|
42
|
-
upgini-1.1.
|
|
43
|
-
upgini-1.1.
|
|
44
|
-
upgini-1.1.
|
|
45
|
-
upgini-1.1.
|
|
46
|
-
upgini-1.1.
|
|
42
|
+
upgini-1.1.104.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
43
|
+
upgini-1.1.104.dist-info/METADATA,sha256=nvFzylBFXaBafhSec5_Ja5KfVcrNc2pzAHNRVzBMzhA,41101
|
|
44
|
+
upgini-1.1.104.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
|
45
|
+
upgini-1.1.104.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
46
|
+
upgini-1.1.104.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|