upgini 1.1.102a1__py3-none-any.whl → 1.1.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/data_source/data_source_publisher.py +2 -0
- upgini/features_enricher.py +17 -5
- upgini/http.py +1 -3
- upgini/metrics.py +27 -3
- upgini/utils/datetime_utils.py +3 -0
- upgini/utils/email_utils.py +4 -7
- {upgini-1.1.102a1.dist-info → upgini-1.1.104.dist-info}/METADATA +2 -2
- {upgini-1.1.102a1.dist-info → upgini-1.1.104.dist-info}/RECORD +11 -11
- {upgini-1.1.102a1.dist-info → upgini-1.1.104.dist-info}/LICENSE +0 -0
- {upgini-1.1.102a1.dist-info → upgini-1.1.104.dist-info}/WHEEL +0 -0
- {upgini-1.1.102a1.dist-info → upgini-1.1.104.dist-info}/top_level.txt +0 -0
|
@@ -44,6 +44,7 @@ class DataSourcePublisher:
|
|
|
44
44
|
exclude_columns: Optional[List[str]] = None,
|
|
45
45
|
hash_feature_names=False,
|
|
46
46
|
snapshot_frequency_days: Optional[int] = None,
|
|
47
|
+
features_for_embeddings: Optional[List[str]] = None,
|
|
47
48
|
) -> str:
|
|
48
49
|
trace_id = str(uuid.uuid4())
|
|
49
50
|
|
|
@@ -65,6 +66,7 @@ class DataSourcePublisher:
|
|
|
65
66
|
"excludeColumns": exclude_columns,
|
|
66
67
|
"hashFeatureNames": hash_feature_names,
|
|
67
68
|
"snapshotFrequencyDays": snapshot_frequency_days,
|
|
69
|
+
"featuresForEmbeddings": features_for_embeddings,
|
|
68
70
|
}
|
|
69
71
|
self.logger.info(f"Start registering data table {request}")
|
|
70
72
|
|
upgini/features_enricher.py
CHANGED
|
@@ -639,8 +639,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
639
639
|
if (
|
|
640
640
|
self._search_task is None
|
|
641
641
|
or self._search_task.initial_max_hit_rate_v2() is None
|
|
642
|
-
or self.X is None
|
|
643
|
-
or self.y is None
|
|
642
|
+
or (self.X is None and X is None)
|
|
643
|
+
or (self.y is None and y is None)
|
|
644
644
|
):
|
|
645
645
|
raise ValidationError(bundle.get("metrics_unfitted_enricher"))
|
|
646
646
|
|
|
@@ -653,6 +653,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
653
653
|
self.__display_slack_community_link(msg)
|
|
654
654
|
return None
|
|
655
655
|
|
|
656
|
+
if (
|
|
657
|
+
estimator is not None
|
|
658
|
+
and hasattr(estimator, "get_param")
|
|
659
|
+
and estimator.get_param("cat_features") is not None
|
|
660
|
+
):
|
|
661
|
+
cat_features = estimator.get_param("cat_features")
|
|
662
|
+
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
663
|
+
effectiveX = X or self.X
|
|
664
|
+
cat_features = [effectiveX.columns[i] for i in cat_features]
|
|
665
|
+
else:
|
|
666
|
+
cat_features = None
|
|
667
|
+
|
|
656
668
|
prepared_data = self._prepare_data_for_metrics(
|
|
657
669
|
trace_id, X, y, eval_set, exclude_features_sources, importance_threshold, max_features
|
|
658
670
|
)
|
|
@@ -699,7 +711,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
699
711
|
f"Calculate baseline {metric} on client features: {fitting_X.columns.to_list()}"
|
|
700
712
|
)
|
|
701
713
|
baseline_estimator = EstimatorWrapper.create(
|
|
702
|
-
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
|
|
714
|
+
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
|
|
703
715
|
)
|
|
704
716
|
etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
|
|
705
717
|
|
|
@@ -711,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
711
723
|
f"Calculate enriched {metric} on combined features: {fitting_enriched_X.columns.to_list()}"
|
|
712
724
|
)
|
|
713
725
|
enriched_estimator = EstimatorWrapper.create(
|
|
714
|
-
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
|
|
726
|
+
estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
|
|
715
727
|
)
|
|
716
728
|
enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
717
729
|
if etalon_metric is not None:
|
|
@@ -2224,7 +2236,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2224
2236
|
|
|
2225
2237
|
def _validate_binary_observations(self, y):
|
|
2226
2238
|
task_type = self.model_task_type or define_task(y, self.logger, silent=True)
|
|
2227
|
-
if task_type == ModelTaskType.BINARY and
|
|
2239
|
+
if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
|
|
2228
2240
|
msg = bundle.get("binary_small_dataset")
|
|
2229
2241
|
self.logger.warning(msg)
|
|
2230
2242
|
print(msg)
|
upgini/http.py
CHANGED
|
@@ -790,7 +790,6 @@ class BackendLogHandler(logging.Handler):
|
|
|
790
790
|
text = self.format(record)
|
|
791
791
|
tags = get_track_metrics()
|
|
792
792
|
tags["version"] = __version__
|
|
793
|
-
print(f"Sending log to server: {text}")
|
|
794
793
|
self.rest_client.send_log_event(
|
|
795
794
|
LogEvent(
|
|
796
795
|
source="python",
|
|
@@ -800,8 +799,7 @@ class BackendLogHandler(logging.Handler):
|
|
|
800
799
|
service="PyLib",
|
|
801
800
|
)
|
|
802
801
|
)
|
|
803
|
-
except Exception as e:
|
|
804
|
-
print(f"Failed to send log: {e}")
|
|
802
|
+
except Exception:
|
|
805
803
|
pass
|
|
806
804
|
|
|
807
805
|
thread = threading.Thread(target=task)
|
upgini/metrics.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Callable, List, Tuple, Union
|
|
2
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
@@ -14,6 +14,7 @@ from sklearn.metrics._regression import (
|
|
|
14
14
|
mean_squared_error,
|
|
15
15
|
)
|
|
16
16
|
from sklearn.model_selection import BaseCrossValidator, cross_validate
|
|
17
|
+
from copy import deepcopy
|
|
17
18
|
|
|
18
19
|
from upgini.errors import ValidationError
|
|
19
20
|
from upgini.metadata import ModelTaskType
|
|
@@ -36,6 +37,11 @@ CATBOOST_PARAMS = {
|
|
|
36
37
|
|
|
37
38
|
LIGHTGBM_PARAMS = {
|
|
38
39
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
40
|
+
"num_leaves": 16,
|
|
41
|
+
"max_depth": 4,
|
|
42
|
+
"n_estimators": 150,
|
|
43
|
+
"learning_rate": 0.05,
|
|
44
|
+
"min_child_weight": 1,
|
|
39
45
|
}
|
|
40
46
|
|
|
41
47
|
N_FOLDS = 5
|
|
@@ -124,6 +130,7 @@ class EstimatorWrapper:
|
|
|
124
130
|
cv: BaseCrossValidator,
|
|
125
131
|
X: pd.DataFrame,
|
|
126
132
|
scoring: Union[Callable, str, None] = None,
|
|
133
|
+
cat_features: Optional[List[str]] = None,
|
|
127
134
|
) -> "EstimatorWrapper":
|
|
128
135
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
129
136
|
kwargs = {
|
|
@@ -144,8 +151,16 @@ class EstimatorWrapper:
|
|
|
144
151
|
else:
|
|
145
152
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
|
|
146
153
|
else:
|
|
147
|
-
|
|
154
|
+
if hasattr(estimator, "copy"):
|
|
155
|
+
estimator_copy = estimator.copy()
|
|
156
|
+
else:
|
|
157
|
+
estimator_copy = deepcopy(estimator)
|
|
158
|
+
kwargs["estimator"] = estimator_copy
|
|
148
159
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
160
|
+
if cat_features is not None:
|
|
161
|
+
estimator_copy.set_params(cat_features=[
|
|
162
|
+
X.columns.get_loc(cat_feature) for cat_feature in cat_features
|
|
163
|
+
])
|
|
149
164
|
estimator = CatBoostWrapper(**kwargs)
|
|
150
165
|
else:
|
|
151
166
|
try:
|
|
@@ -192,6 +207,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
192
207
|
else:
|
|
193
208
|
X = X.drop(columns=name)
|
|
194
209
|
cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
|
|
210
|
+
if (
|
|
211
|
+
hasattr(self.estimator, "get_param")
|
|
212
|
+
and hasattr(self.estimator, "_init_params")
|
|
213
|
+
and self.estimator.get_param("cat_features") is not None
|
|
214
|
+
):
|
|
215
|
+
cat_features_set = set(cat_features_idx)
|
|
216
|
+
cat_features_set.update(self.estimator.get_param("cat_features"))
|
|
217
|
+
cat_features_idx = list(cat_features_set)
|
|
218
|
+
del self.estimator._init_params["cat_features"]
|
|
195
219
|
|
|
196
220
|
params.update({"cat_features": cat_features_idx})
|
|
197
221
|
return X, y, params
|
|
@@ -371,6 +395,6 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
|
371
395
|
def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
|
|
372
396
|
many_values_features_count = 0
|
|
373
397
|
for f in _get_cat_features(X):
|
|
374
|
-
if X[f].nunique() > 100:
|
|
398
|
+
if X[f].astype("string").nunique() > 100:
|
|
375
399
|
many_values_features_count += 1
|
|
376
400
|
return many_values_features_count >= 2
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -5,6 +5,7 @@ import numpy as np
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
7
7
|
from dateutil.relativedelta import relativedelta
|
|
8
|
+
import datetime
|
|
8
9
|
|
|
9
10
|
from upgini.errors import ValidationError
|
|
10
11
|
|
|
@@ -29,6 +30,8 @@ class DateTimeSearchKeyConverter:
|
|
|
29
30
|
|
|
30
31
|
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
31
32
|
df = df.copy()
|
|
33
|
+
if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
|
|
34
|
+
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
32
35
|
if is_string_dtype(df[self.date_column]):
|
|
33
36
|
try:
|
|
34
37
|
df[self.date_column] = pd.to_datetime(df[self.date_column], format=self.date_format)
|
upgini/utils/email_utils.py
CHANGED
|
@@ -9,8 +9,7 @@ from pandas.api.types import is_string_dtype
|
|
|
9
9
|
from upgini.metadata import SearchKey
|
|
10
10
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
EMAIL_REGEX = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
|
|
12
|
+
EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
@@ -20,13 +19,11 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
20
19
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
21
20
|
if not is_string_dtype(column):
|
|
22
21
|
return False
|
|
22
|
+
if not column.astype("string").str.contains("@").any():
|
|
23
|
+
return False
|
|
23
24
|
|
|
24
25
|
all_count = len(column)
|
|
25
|
-
is_email_count = len(
|
|
26
|
-
column.loc[
|
|
27
|
-
column.astype("string").str.fullmatch(EMAIL_REGEX)
|
|
28
|
-
]
|
|
29
|
-
)
|
|
26
|
+
is_email_count = len(column.loc[column.astype("string").str.fullmatch(EMAIL_REGEX)])
|
|
30
27
|
return is_email_count / all_count > 0.1
|
|
31
28
|
|
|
32
29
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.102a1
|
|
3
|
+
Version: 1.1.104
|
|
4
4
|
Summary: Low-code feature search and enrichment library for machine learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -54,7 +54,7 @@ enriches your ML pipeline with only the relevant features</b> </p>
|
|
|
54
54
|
</p>
|
|
55
55
|
<p align=center>
|
|
56
56
|
<a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
|
|
57
|
-
<a href="https://
|
|
57
|
+
<a href="https://pypi.org/project/upgini/"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/upgini"></a>
|
|
58
58
|
<a href="https://pypi.org/project/upgini/"><img alt="PyPI" src="https://img.shields.io/pypi/v/upgini?label=Release"></a>
|
|
59
59
|
<a href="https://pypistats.org/packages/upgini"><img alt="Downloads from pypistats" src="https://pepy.tech/badge/upgini"></a>
|
|
60
60
|
<a href="https://4mlg.short.gy/join-upgini-community"><img alt="Upgini slack community" src="https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack"></a>
|
|
@@ -2,17 +2,17 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=fl01WTbM2smgskjrHQJS9oTzymEj5ZulGngCU_d5PnQ,42110
|
|
4
4
|
upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
-
upgini/http.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=PtZIiOtpLItYRm3U7e5gsWAwAiTze4rznuKFFHjFpuQ,109768
|
|
6
|
+
upgini/http.py,sha256=kgWj6wU1PbGPoGAbRvK35umXQ5zwEfEKeGy5Az0fss0,35479
|
|
7
7
|
upgini/metadata.py,sha256=GPGsaGi5UtePQR2Qiqc7OJZn-ewvHmvepn3P_wJDW7Y,5856
|
|
8
|
-
upgini/metrics.py,sha256=
|
|
8
|
+
upgini/metrics.py,sha256=uJhtGKgUUFnvdF16xscfe9AGDoDN6LqUV97RWDP39NU,14869
|
|
9
9
|
upgini/search_task.py,sha256=H7l-BhCRF9t58D0L1xNdC_qU_JFHYnAZZ165fVDQgmM,33884
|
|
10
10
|
upgini/spinner.py,sha256=X9a0xhj0QVIwjVTTjXUTuAgPBnyrLbW-B6G534fxs1E,1149
|
|
11
11
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
12
12
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
13
13
|
upgini/ads_management/ads_manager.py,sha256=Cc3v4lLLpM0g4oUH_q2DYFN3bNWpSmltAGnZQby3G74,2630
|
|
14
14
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
upgini/data_source/data_source_publisher.py,sha256
|
|
15
|
+
upgini/data_source/data_source_publisher.py,sha256=-Tpqiw6xrCinxdDKIEg6aS68ZqLwxoBrg4J4PTQNs6g,8546
|
|
16
16
|
upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
|
|
17
17
|
upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
|
|
18
18
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -29,9 +29,9 @@ upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCS
|
|
|
29
29
|
upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
|
|
30
30
|
upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o,6436
|
|
31
31
|
upgini/utils/cv_utils.py,sha256=PeexQVPWrpUNlmwGtfU1FWA-aI1UyrMDgMT594ErpxA,2252
|
|
32
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
32
|
+
upgini/utils/datetime_utils.py,sha256=PK1Fc5rJ_UhCJc1TNOZPSrtsYxjD7v9dsBYOZj1RKvo,4292
|
|
33
33
|
upgini/utils/display_utils.py,sha256=iG3-hdv8_rJDWKwnQYIi1SHF-gLPAEi8jjk_05-qtMg,1934
|
|
34
|
-
upgini/utils/email_utils.py,sha256=
|
|
34
|
+
upgini/utils/email_utils.py,sha256=2IUxP1e8DsmU4qS1BN3n1JmuziZO_cV35fNf4Di0yxc,3090
|
|
35
35
|
upgini/utils/features_validator.py,sha256=LZAKTWtmINWII09UHF0R0muEz7yHLGlJkLUk8zM305Q,2190
|
|
36
36
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
37
37
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
@@ -39,8 +39,8 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
|
|
|
39
39
|
upgini/utils/target_utils.py,sha256=3eHrDy_Dc9ozuOwHGnGA705m9glCxKmjB-DfLrflqiA,1370
|
|
40
40
|
upgini/utils/track_info.py,sha256=O_oL4gy1jH0DVgtiUeZAW0YKCeRT4B_bzH_SZYkFaOE,4076
|
|
41
41
|
upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
|
|
42
|
-
upgini-1.1.102a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
43
|
-
upgini-1.1.
|
|
44
|
-
upgini-1.1.102a1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
|
45
|
-
upgini-1.1.102a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
46
|
-
upgini-1.1.102a1.dist-info/RECORD,,
|
|
42
|
+
upgini-1.1.104.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
43
|
+
upgini-1.1.104.dist-info/METADATA,sha256=nvFzylBFXaBafhSec5_Ja5KfVcrNc2pzAHNRVzBMzhA,41101
|
|
44
|
+
upgini-1.1.104.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
|
45
|
+
upgini-1.1.104.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
46
|
+
upgini-1.1.104.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|