upgini 1.2.96a3906.dev2__py3-none-any.whl → 1.2.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +20 -10
- upgini/autofe/unary.py +22 -1
- upgini/data_source/data_source_publisher.py +20 -3
- upgini/features_enricher.py +82 -79
- upgini/http.py +23 -0
- upgini/metrics.py +6 -6
- upgini/normalizer/normalize_utils.py +25 -0
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/ip_utils.py +2 -2
- {upgini-1.2.96a3906.dev2.dist-info → upgini-1.2.98.dist-info}/METADATA +1 -1
- {upgini-1.2.96a3906.dev2.dist-info → upgini-1.2.98.dist-info}/RECORD +14 -14
- {upgini-1.2.96a3906.dev2.dist-info → upgini-1.2.98.dist-info}/WHEEL +0 -0
- {upgini-1.2.96a3906.dev2.dist-info → upgini-1.2.98.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.96a3906.dev2"
|
1
|
+
__version__ = "1.2.98"
|
upgini/autofe/feature.py
CHANGED
@@ -18,10 +18,7 @@ class Column:
|
|
18
18
|
self.data = data
|
19
19
|
self.calculate_all = calculate_all
|
20
20
|
|
21
|
-
def
|
22
|
-
return self.name
|
23
|
-
|
24
|
-
def set_op_params(self, params: Dict[str, str]) -> "Column":
|
21
|
+
def set_op_params(self, params: Dict[str, str], **kwargs) -> "Column":
|
25
22
|
return self
|
26
23
|
|
27
24
|
def get_op_params(self, **kwargs):
|
@@ -37,8 +34,21 @@ class Column:
|
|
37
34
|
def get_column_nodes(self) -> List["Column"]:
|
38
35
|
return [self]
|
39
36
|
|
40
|
-
def get_columns(self, **kwargs)
|
41
|
-
|
37
|
+
def get_columns(self, unhash=False, **kwargs):
|
38
|
+
name = self.name
|
39
|
+
return [self._unhash(name) if unhash else name]
|
40
|
+
|
41
|
+
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
42
|
+
return self.get_columns(**kwargs)[0]
|
43
|
+
|
44
|
+
def _unhash(self, feature_name: str) -> str:
|
45
|
+
last_component_idx = feature_name.rfind("_")
|
46
|
+
if not feature_name.startswith("f_"):
|
47
|
+
return feature_name # etalon feature
|
48
|
+
elif last_component_idx == 1:
|
49
|
+
return feature_name[2:] # fully hashed name, cannot unhash
|
50
|
+
else:
|
51
|
+
return feature_name[2:last_component_idx]
|
42
52
|
|
43
53
|
@property
|
44
54
|
def children(self) -> List[Union["Feature", "Column"]]:
|
@@ -81,7 +91,7 @@ class Feature:
|
|
81
91
|
self.cached_display_name = cached_display_name
|
82
92
|
self.alias = alias
|
83
93
|
|
84
|
-
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
94
|
+
def set_op_params(self, params: Optional[Dict[str, str]], **kwargs) -> "Feature":
|
85
95
|
obj_dict = pydantic_dump_method(self.op)().copy()
|
86
96
|
obj_dict.update(params or {})
|
87
97
|
self.op = pydantic_parse_method(self.op.__class__)(obj_dict)
|
@@ -89,13 +99,13 @@ class Feature:
|
|
89
99
|
|
90
100
|
for child in self.children:
|
91
101
|
child_params = {
|
92
|
-
k[len(child.get_display_name()) + 1 :]: v
|
102
|
+
k[len(child.get_display_name(**kwargs)) + 1 :]: v
|
93
103
|
for k, v in params.items()
|
94
|
-
if k.startswith(child.get_display_name())
|
104
|
+
if k.startswith(child.get_display_name(**kwargs))
|
95
105
|
}
|
96
106
|
if not child_params:
|
97
107
|
child_params = params
|
98
|
-
child.set_op_params(child_params)
|
108
|
+
child.set_op_params(child_params, **kwargs)
|
99
109
|
return self
|
100
110
|
|
101
111
|
def get_op_params(self, **kwargs) -> Dict[str, str]:
|
upgini/autofe/unary.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional
|
|
3
3
|
import numpy as np
|
4
4
|
import pandas as pd
|
5
5
|
|
6
|
-
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
6
|
+
from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
|
7
7
|
from upgini.autofe.utils import pydantic_validator
|
8
8
|
|
9
9
|
|
@@ -198,3 +198,24 @@ class Cluster(PandasOperator):
|
|
198
198
|
input_type: Optional[str] = "vector"
|
199
199
|
output_type: Optional[str] = "category"
|
200
200
|
is_categorical: bool = True
|
201
|
+
|
202
|
+
|
203
|
+
class OutlierDistance(PandasOperator, ParametrizedOperator):
|
204
|
+
name: str = "outlier_dist"
|
205
|
+
is_unary: bool = True
|
206
|
+
input_type: Optional[str] = "vector"
|
207
|
+
output_type: Optional[str] = "float"
|
208
|
+
class_value: Optional[str] = None
|
209
|
+
|
210
|
+
def to_formula(self) -> str:
|
211
|
+
return f"outlier_dist_{self.class_value if self.class_value is not None else 'all'}"
|
212
|
+
|
213
|
+
@classmethod
|
214
|
+
def from_formula(cls, formula: str) -> Optional["OutlierDistance"]:
|
215
|
+
if formula == "outlier_dist":
|
216
|
+
return cls()
|
217
|
+
|
218
|
+
if formula.startswith("outlier_dist_"):
|
219
|
+
class_value = formula.split("_")[-1]
|
220
|
+
return cls(class_value=None if class_value == "all" else class_value)
|
221
|
+
return None
|
@@ -149,9 +149,8 @@ class DataSourcePublisher:
|
|
149
149
|
existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
|
150
150
|
if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
|
151
151
|
existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
|
152
|
-
if (
|
153
|
-
|
154
|
-
or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
|
152
|
+
if existing_search_keys == {v.value.name for v in search_keys.values()} or (
|
153
|
+
"IP" in str(existing_search_keys) and "IP" in str(search_keys.values())
|
155
154
|
):
|
156
155
|
raise ValidationError(
|
157
156
|
"ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
|
@@ -494,3 +493,21 @@ class DataSourcePublisher:
|
|
494
493
|
raise Exception("Failed to reannounce all ADS: " + status_response["errorMessage"])
|
495
494
|
except Exception:
|
496
495
|
self.logger.exception("Failed to reannounce all ADS-es")
|
496
|
+
|
497
|
+
def upload_autofe_model(
|
498
|
+
self, file_path: str, name: str, model_type: Optional[Literal["ONNX"]] = None, description: str = ""
|
499
|
+
):
|
500
|
+
if model_type is not None and model_type not in ["ONNX"]:
|
501
|
+
raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
|
502
|
+
metadata = {
|
503
|
+
"modelName": name,
|
504
|
+
"modelType": model_type or "ONNX",
|
505
|
+
"description": description,
|
506
|
+
}
|
507
|
+
trace_id = str(uuid.uuid4())
|
508
|
+
with MDC(trace_id=trace_id):
|
509
|
+
try:
|
510
|
+
self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
|
511
|
+
except Exception:
|
512
|
+
self.logger.exception("Failed to upload autofe model")
|
513
|
+
raise
|
upgini/features_enricher.py
CHANGED
@@ -71,10 +71,7 @@ from upgini.search_task import SearchTask
|
|
71
71
|
from upgini.spinner import Spinner
|
72
72
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
73
73
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
74
|
-
from upgini.utils.country_utils import (
|
75
|
-
CountrySearchKeyConverter,
|
76
|
-
CountrySearchKeyDetector,
|
77
|
-
)
|
74
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
78
75
|
from upgini.utils.custom_loss_utils import (
|
79
76
|
get_additional_params_custom_loss,
|
80
77
|
get_runtime_params_custom_loss,
|
@@ -105,11 +102,8 @@ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
105
102
|
from upgini.utils.features_validator import FeaturesValidator
|
106
103
|
from upgini.utils.format import Format
|
107
104
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
108
|
-
from upgini.utils.phone_utils import
|
109
|
-
from upgini.utils.postal_code_utils import (
|
110
|
-
PostalCodeSearchKeyConverter,
|
111
|
-
PostalCodeSearchKeyDetector,
|
112
|
-
)
|
105
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
106
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
113
107
|
|
114
108
|
try:
|
115
109
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -1122,6 +1116,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1122
1116
|
# and calculate final metric (and uplift)
|
1123
1117
|
enriched_metric = None
|
1124
1118
|
uplift = None
|
1119
|
+
uplift_perc = None
|
1125
1120
|
enriched_estimator = None
|
1126
1121
|
if set(fitting_X.columns) != set(fitting_enriched_X.columns):
|
1127
1122
|
self.logger.info(
|
@@ -1153,6 +1148,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1153
1148
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
1154
1149
|
if baseline_metric is not None and enriched_metric is not None:
|
1155
1150
|
uplift = (enriched_cv_result.metric - baseline_cv_result.metric) * multiplier
|
1151
|
+
uplift_perc = uplift / abs(baseline_cv_result.metric) * 100
|
1156
1152
|
|
1157
1153
|
train_metrics = {
|
1158
1154
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
@@ -1179,7 +1175,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1179
1175
|
enriched_metric
|
1180
1176
|
)
|
1181
1177
|
if uplift is not None:
|
1182
|
-
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
|
1178
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = round(uplift, 3)
|
1179
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_perc_header")] = (
|
1180
|
+
f"{round(uplift_perc, 1)}%"
|
1181
|
+
)
|
1183
1182
|
metrics = [train_metrics]
|
1184
1183
|
|
1185
1184
|
# 3 If eval_set is presented - fit final model on train enriched data and score each
|
@@ -1228,8 +1227,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1228
1227
|
|
1229
1228
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
1230
1229
|
eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
|
1230
|
+
eval_uplift_perc = eval_uplift / abs(etalon_eval_results.metric) * 100
|
1231
1231
|
else:
|
1232
1232
|
eval_uplift = None
|
1233
|
+
eval_uplift_perc = None
|
1233
1234
|
|
1234
1235
|
eval_metrics = {
|
1235
1236
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
@@ -1260,7 +1261,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1260
1261
|
enriched_eval_metric
|
1261
1262
|
)
|
1262
1263
|
if eval_uplift is not None:
|
1263
|
-
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
1264
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = round(eval_uplift, 3)
|
1265
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_perc_header")] = (
|
1266
|
+
f"{round(eval_uplift_perc, 1)}%"
|
1267
|
+
)
|
1264
1268
|
|
1265
1269
|
metrics.append(eval_metrics)
|
1266
1270
|
|
@@ -2495,21 +2499,6 @@ if response.status_code == 200:
|
|
2495
2499
|
)
|
2496
2500
|
df = converter.convert(df)
|
2497
2501
|
|
2498
|
-
phone_column = self._get_phone_column(search_keys)
|
2499
|
-
country_column = self._get_country_column(search_keys)
|
2500
|
-
if phone_column:
|
2501
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
2502
|
-
df = converter.convert(df)
|
2503
|
-
|
2504
|
-
if country_column:
|
2505
|
-
converter = CountrySearchKeyConverter(country_column)
|
2506
|
-
df = converter.convert(df)
|
2507
|
-
|
2508
|
-
postal_code = self._get_postal_column(search_keys)
|
2509
|
-
if postal_code:
|
2510
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
2511
|
-
df = converter.convert(df)
|
2512
|
-
|
2513
2502
|
meaning_types = {}
|
2514
2503
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2515
2504
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
@@ -2904,6 +2893,7 @@ if response.status_code == 200:
|
|
2904
2893
|
self.fit_generated_features.extend(converter.generated_features)
|
2905
2894
|
else:
|
2906
2895
|
self.logger.info("Input dataset hasn't date column")
|
2896
|
+
# TODO remove when this logic will be implemented on the back
|
2907
2897
|
if self.__should_add_date_column():
|
2908
2898
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
2909
2899
|
|
@@ -2935,6 +2925,26 @@ if response.status_code == 200:
|
|
2935
2925
|
if normalizer.removed_features:
|
2936
2926
|
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
2937
2927
|
|
2928
|
+
non_feature_columns = [
|
2929
|
+
self.TARGET_NAME,
|
2930
|
+
EVAL_SET_INDEX,
|
2931
|
+
] + list(self.fit_search_keys.keys())
|
2932
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2933
|
+
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
2934
|
+
|
2935
|
+
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2936
|
+
|
2937
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2938
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
2939
|
+
)
|
2940
|
+
if feature_validator_warnings:
|
2941
|
+
for warning in feature_validator_warnings:
|
2942
|
+
self.__log_warning(warning)
|
2943
|
+
self.fit_dropped_features.update(features_to_drop)
|
2944
|
+
df = df.drop(columns=features_to_drop)
|
2945
|
+
|
2946
|
+
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
2947
|
+
|
2938
2948
|
self.__adjust_cv(df)
|
2939
2949
|
|
2940
2950
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
@@ -2974,6 +2984,7 @@ if response.status_code == 200:
|
|
2974
2984
|
# Convert EMAIL to HEM etc after unnesting to do it only with one column
|
2975
2985
|
df = self.__convert_unnestable_keys(df, unnest_search_keys)
|
2976
2986
|
|
2987
|
+
# refresh features columns
|
2977
2988
|
non_feature_columns = [
|
2978
2989
|
self.TARGET_NAME,
|
2979
2990
|
EVAL_SET_INDEX,
|
@@ -2985,17 +2996,6 @@ if response.status_code == 200:
|
|
2985
2996
|
|
2986
2997
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2987
2998
|
|
2988
|
-
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2989
|
-
df, features_columns, self.generate_features, self.fit_columns_renaming
|
2990
|
-
)
|
2991
|
-
if feature_validator_warnings:
|
2992
|
-
for warning in feature_validator_warnings:
|
2993
|
-
self.__log_warning(warning)
|
2994
|
-
self.fit_dropped_features.update(features_to_drop)
|
2995
|
-
df = df.drop(columns=features_to_drop)
|
2996
|
-
|
2997
|
-
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
2998
|
-
|
2999
2999
|
meaning_types = {
|
3000
3000
|
**{col: key.value for col, key in self.fit_search_keys.items()},
|
3001
3001
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
@@ -3225,20 +3225,6 @@ if response.status_code == 200:
|
|
3225
3225
|
self.logger,
|
3226
3226
|
)
|
3227
3227
|
df = converter.convert(df)
|
3228
|
-
phone_column = self._get_phone_column(self.fit_search_keys)
|
3229
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
3230
|
-
if phone_column:
|
3231
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
3232
|
-
df = converter.convert(df)
|
3233
|
-
|
3234
|
-
if country_column:
|
3235
|
-
converter = CountrySearchKeyConverter(country_column)
|
3236
|
-
df = converter.convert(df)
|
3237
|
-
|
3238
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
3239
|
-
if postal_code:
|
3240
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
3241
|
-
df = converter.convert(df)
|
3242
3228
|
|
3243
3229
|
return df
|
3244
3230
|
|
@@ -4188,7 +4174,7 @@ if response.status_code == 200:
|
|
4188
4174
|
|
4189
4175
|
description = {}
|
4190
4176
|
|
4191
|
-
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
4177
|
+
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True, unhash=True))
|
4192
4178
|
if feature_meta is None:
|
4193
4179
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
4194
4180
|
continue
|
@@ -4642,42 +4628,59 @@ if response.status_code == 200:
|
|
4642
4628
|
if isinstance(X_, pd.Series):
|
4643
4629
|
X_ = X_.to_frame()
|
4644
4630
|
|
4645
|
-
# TODO check that this file was already uploaded
|
4646
|
-
|
4647
4631
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
4648
4632
|
X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
|
4633
|
+
x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
|
4634
|
+
if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
|
4635
|
+
self.logger.info(f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping")
|
4636
|
+
else:
|
4637
|
+
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/x.parquet", "x.parquet")
|
4649
4638
|
|
4650
4639
|
if y_ is not None:
|
4651
4640
|
if isinstance(y_, pd.Series):
|
4652
4641
|
y_ = y_.to_frame()
|
4653
4642
|
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4654
|
-
|
4655
|
-
|
4656
|
-
|
4657
|
-
|
4658
|
-
eval_x_ = eval_x_.to_frame()
|
4659
|
-
eval_x_.to_parquet(f"{tmp_dir}/eval_x.parquet", compression="zstd")
|
4660
|
-
if isinstance(eval_y_, pd.Series):
|
4661
|
-
eval_y_ = eval_y_.to_frame()
|
4662
|
-
eval_y_.to_parquet(f"{tmp_dir}/eval_y.parquet", compression="zstd")
|
4663
|
-
self.rest_client.dump_input_files(
|
4664
|
-
trace_id,
|
4665
|
-
f"{tmp_dir}/x.parquet",
|
4666
|
-
f"{tmp_dir}/y.parquet",
|
4667
|
-
f"{tmp_dir}/eval_x.parquet",
|
4668
|
-
f"{tmp_dir}/eval_y.parquet",
|
4643
|
+
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4644
|
+
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4645
|
+
self.logger.info(
|
4646
|
+
f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
|
4669
4647
|
)
|
4670
4648
|
else:
|
4671
|
-
self.rest_client.
|
4672
|
-
|
4673
|
-
|
4674
|
-
|
4675
|
-
|
4676
|
-
|
4677
|
-
|
4678
|
-
|
4679
|
-
|
4680
|
-
|
4649
|
+
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/y.parquet", "y.parquet")
|
4650
|
+
|
4651
|
+
if eval_set_ is not None and len(eval_set_) > 0:
|
4652
|
+
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4653
|
+
if isinstance(eval_x_, pd.Series):
|
4654
|
+
eval_x_ = eval_x_.to_frame()
|
4655
|
+
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4656
|
+
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4657
|
+
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4658
|
+
)
|
4659
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4660
|
+
self.logger.info(
|
4661
|
+
f"File eval_x_{idx}.parquet was already uploaded with"
|
4662
|
+
f" digest {eval_x_digest_sha256}, skipping"
|
4663
|
+
)
|
4664
|
+
else:
|
4665
|
+
self.rest_client.dump_input_file(
|
4666
|
+
trace_id, f"{tmp_dir}/eval_x_{idx}.parquet", f"eval_x_{idx}.parquet"
|
4667
|
+
)
|
4668
|
+
|
4669
|
+
if isinstance(eval_y_, pd.Series):
|
4670
|
+
eval_y_ = eval_y_.to_frame()
|
4671
|
+
eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
|
4672
|
+
eval_y_digest_sha256 = self.rest_client.compute_file_digest(
|
4673
|
+
f"{tmp_dir}/eval_y_{idx}.parquet"
|
4674
|
+
)
|
4675
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
|
4676
|
+
self.logger.info(
|
4677
|
+
f"File eval_y_{idx}.parquet was already uploaded"
|
4678
|
+
f" with digest {eval_y_digest_sha256}, skipping"
|
4679
|
+
)
|
4680
|
+
else:
|
4681
|
+
self.rest_client.dump_input_file(
|
4682
|
+
trace_id, f"{tmp_dir}/eval_y_{idx}.parquet", f"eval_y_{idx}.parquet"
|
4683
|
+
)
|
4681
4684
|
except Exception:
|
4682
4685
|
self.logger.warning("Failed to dump input files", exc_info=True)
|
4683
4686
|
|
upgini/http.py
CHANGED
@@ -12,6 +12,7 @@ from enum import Enum
|
|
12
12
|
from functools import lru_cache
|
13
13
|
from http.client import HTTPConnection
|
14
14
|
from json import dumps
|
15
|
+
from pathlib import Path
|
15
16
|
from typing import Any, Dict, List, Optional, Tuple
|
16
17
|
from urllib.parse import urljoin
|
17
18
|
|
@@ -292,6 +293,7 @@ class _RestClient:
|
|
292
293
|
UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
|
293
294
|
STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"
|
294
295
|
UNION_SEARCH_TASKS_URI_FMT = SERVICE_ROOT_V2 + "search/merge"
|
296
|
+
UPLOAD_AUTOFE_MODEL_URI_FMT = "private/api/v2/autofe/model/upload"
|
295
297
|
|
296
298
|
ACCESS_TOKEN_HEADER_NAME = "Authorization"
|
297
299
|
CONTENT_TYPE_HEADER_NAME = "Content-Type"
|
@@ -404,6 +406,16 @@ class _RestClient:
|
|
404
406
|
meaning_types = [_RestClient.meaning_type_by_name(name, metadata) for name in search_key_names]
|
405
407
|
return [meaning_type.value for meaning_type in meaning_types if meaning_type is not None]
|
406
408
|
|
409
|
+
def dump_input_file(self, trace_id: str, path: str, file_name: str):
|
410
|
+
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
411
|
+
with open(path, "rb") as file:
|
412
|
+
files = {"file": (file_name, file, "application/octet-stream")}
|
413
|
+
self._with_unauth_retry(
|
414
|
+
lambda: self._send_post_file_req_v2(
|
415
|
+
api_path, files, trace_id=trace_id, need_json_response=False
|
416
|
+
)
|
417
|
+
)
|
418
|
+
|
407
419
|
def dump_input_files(
|
408
420
|
self,
|
409
421
|
trace_id: str,
|
@@ -811,6 +823,17 @@ class _RestClient:
|
|
811
823
|
api_path = self.UNION_SEARCH_TASKS_URI_FMT
|
812
824
|
return self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
|
813
825
|
|
826
|
+
def upload_autofe_model(self, file_path: str, metadata: dict, trace_id: str):
|
827
|
+
api_path = self.UPLOAD_AUTOFE_MODEL_URI_FMT
|
828
|
+
with open(file_path, "rb") as file:
|
829
|
+
files = {
|
830
|
+
"meta": ("metadata.json", dumps(metadata).encode(), "application/json"),
|
831
|
+
"model": (Path(file_path).name, file, "application/octet-stream"),
|
832
|
+
}
|
833
|
+
return self._with_unauth_retry(
|
834
|
+
lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
|
835
|
+
)
|
836
|
+
|
814
837
|
# ---
|
815
838
|
|
816
839
|
def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
|
upgini/metrics.py
CHANGED
@@ -399,14 +399,14 @@ class EstimatorWrapper:
|
|
399
399
|
self.converted_to_str.append(c)
|
400
400
|
elif c in self.cat_features:
|
401
401
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
402
|
-
x[c] = x[c].astype(
|
402
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
403
403
|
self.converted_to_int.append(c)
|
404
404
|
elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
|
405
405
|
self.logger.info(
|
406
406
|
f"Convert categorical feature {c} with integer categories"
|
407
407
|
" to int64 and remove from cat_features"
|
408
408
|
)
|
409
|
-
x[c] = x[c].astype(
|
409
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
410
410
|
self.converted_to_int.append(c)
|
411
411
|
self.cat_features.remove(c)
|
412
412
|
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
@@ -419,7 +419,7 @@ class EstimatorWrapper:
|
|
419
419
|
else:
|
420
420
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
421
421
|
self.logger.info(f"Convert bool feature {c} to int64")
|
422
|
-
x[c] = x[c].astype(
|
422
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
423
423
|
self.converted_to_int.append(c)
|
424
424
|
elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
|
425
425
|
try:
|
@@ -442,7 +442,7 @@ class EstimatorWrapper:
|
|
442
442
|
if self.converted_to_int:
|
443
443
|
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
444
444
|
for c in self.converted_to_int:
|
445
|
-
x[c] = x[c].astype(
|
445
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
446
446
|
|
447
447
|
if self.converted_to_str:
|
448
448
|
self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
|
@@ -896,7 +896,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
896
896
|
x[c] = x[c].astype("category")
|
897
897
|
|
898
898
|
for c in x.columns:
|
899
|
-
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
899
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
|
900
900
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
901
901
|
self.dropped_features.append(c)
|
902
902
|
x = x.drop(columns=c, errors="ignore")
|
@@ -987,7 +987,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
987
987
|
x[c] = x[c].astype("category")
|
988
988
|
params["cat_features"] = self.cat_features
|
989
989
|
for c in x.columns:
|
990
|
-
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
990
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
|
991
991
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
992
992
|
self.dropped_features.append(c)
|
993
993
|
x = x.drop(columns=c, errors="ignore")
|
@@ -24,8 +24,11 @@ from upgini.metadata import (
|
|
24
24
|
)
|
25
25
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
27
|
+
from upgini.utils.country_utils import CountrySearchKeyConverter
|
27
28
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
29
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
28
30
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
31
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
|
29
32
|
|
30
33
|
|
31
34
|
class Normalizer:
|
@@ -65,6 +68,12 @@ class Normalizer:
|
|
65
68
|
|
66
69
|
df = self._convert_phone_numbers(df)
|
67
70
|
|
71
|
+
df = self._convert_ip_addresses(df)
|
72
|
+
|
73
|
+
df = self._convert_postal_codes(df)
|
74
|
+
|
75
|
+
df = self._convert_countries(df)
|
76
|
+
|
68
77
|
df = self.__convert_features_types(df)
|
69
78
|
|
70
79
|
return df, self.search_keys, self.generated_features
|
@@ -177,6 +186,22 @@ class Normalizer:
|
|
177
186
|
df = converter.convert(df)
|
178
187
|
return df
|
179
188
|
|
189
|
+
def _convert_ip_addresses(self, df: pd.DataFrame) -> pd.DataFrame:
|
190
|
+
for ip_col in SearchKey.find_all_keys(self.search_keys, SearchKey.IP):
|
191
|
+
df[ip_col] = df[ip_col].apply(IpSearchKeyConverter.safe_ip_parse)
|
192
|
+
return df
|
193
|
+
|
194
|
+
def _convert_postal_codes(self, df: pd.DataFrame) -> pd.DataFrame:
|
195
|
+
for postal_code_col in SearchKey.find_all_keys(self.search_keys, SearchKey.POSTAL_CODE):
|
196
|
+
df = PostalCodeSearchKeyConverter(postal_code_col).convert(df)
|
197
|
+
return df
|
198
|
+
|
199
|
+
def _convert_countries(self, df: pd.DataFrame) -> pd.DataFrame:
|
200
|
+
maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
|
201
|
+
if maybe_country_col:
|
202
|
+
df = CountrySearchKeyConverter(maybe_country_col).convert(df)
|
203
|
+
return df
|
204
|
+
|
180
205
|
def __convert_features_types(self, df: pd.DataFrame):
|
181
206
|
# self.logger.info("Convert features to supported data types")
|
182
207
|
|
@@ -284,8 +284,8 @@ quality_metrics_segment_header=Dataset type
|
|
284
284
|
quality_metrics_match_rate_header=Match rate
|
285
285
|
quality_metrics_baseline_header=Baseline {}
|
286
286
|
quality_metrics_enriched_header=Enriched {}
|
287
|
-
quality_metrics_uplift_header=Uplift
|
288
|
-
|
287
|
+
quality_metrics_uplift_header=Uplift, abs
|
288
|
+
quality_metrics_uplift_perc_header=Uplift, %
|
289
289
|
|
290
290
|
# Legacy native api messages
|
291
291
|
dataset_dataframe_or_path_empty=Either `df` or `path` must be provided
|
upgini/utils/ip_utils.py
CHANGED
@@ -79,7 +79,7 @@ class IpSearchKeyConverter:
|
|
79
79
|
pass
|
80
80
|
|
81
81
|
@staticmethod
|
82
|
-
def
|
82
|
+
def safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address, bytes]) -> Optional[_BaseAddress]:
|
83
83
|
try:
|
84
84
|
return ip_address(ip)
|
85
85
|
except ValueError:
|
@@ -110,7 +110,7 @@ class IpSearchKeyConverter:
|
|
110
110
|
self.logger.info("Convert ip address to int")
|
111
111
|
original_ip = self.columns_renaming[self.ip_column]
|
112
112
|
|
113
|
-
df[self.ip_column] = df[self.ip_column].apply(self.
|
113
|
+
df[self.ip_column] = df[self.ip_column].apply(self.safe_ip_parse)
|
114
114
|
if df[self.ip_column].isnull().all():
|
115
115
|
raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
|
116
116
|
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=lf7CSxsvho_1d6v9vU7WGiP7TC1XTdLXyd8LGFkkMoU,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=KSOEzO29nY79RIW0hdbf1qXQGxa3itKZ0PkcwVPPf9U,219954
|
7
|
+
upgini/http.py,sha256=DNcoS7qdxG0mOJn6I8r6O5I6XdIJTdzDzW3hkz3NgG4,45443
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=gXr2aiw5j9QBWBo1hZp40Is679hef5q8MrT6LJfjsBk,45661
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -16,10 +16,10 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
|
18
18
|
upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
|
19
|
-
upgini/autofe/feature.py,sha256=
|
19
|
+
upgini/autofe/feature.py,sha256=1jiy9_aiaQdVGIh5UbnIGF8St5BkiikOUh5KywMLYRY,16056
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
21
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
22
|
-
upgini/autofe/unary.py,sha256=
|
22
|
+
upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
|
23
23
|
upgini/autofe/utils.py,sha256=dYrtyAM8Vcc_R8u4dNo54IsGrHKagTHDJTKhGho0bRg,2967
|
24
24
|
upgini/autofe/vector.py,sha256=jHs0nNTOaHspYUlxW7fjQepk4cvr_JDQ65L1OCiVsds,1360
|
25
25
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
|
|
31
31
|
upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
|
32
32
|
upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
|
33
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
34
|
+
upgini/data_source/data_source_publisher.py,sha256=iqcDx2oRiVyeuzQqqPBnfqWhzQSXTyya0wk2ltibBAA,25010
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
38
|
+
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=UO6K0wwvutyOyClOnJYlFYAETzMSen6hHnj3--5AIAs,28497
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -59,7 +59,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
59
59
|
upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
62
|
-
upgini/utils/ip_utils.py,sha256=
|
62
|
+
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
63
63
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.98.dist-info/METADATA,sha256=3-L9c9p3ul3Cz1l1zFF2XTxlkN21bygcCWccOd6KbZo,49528
|
75
|
+
upgini-1.2.98.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.98.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.98.dist-info/RECORD,,
|
File without changes
|
File without changes
|