upgini 1.2.96a3906.dev1__py3-none-any.whl → 1.2.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +20 -3
- upgini/features_enricher.py +81 -78
- upgini/http.py +23 -0
- upgini/normalizer/normalize_utils.py +50 -23
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/ip_utils.py +2 -2
- {upgini-1.2.96a3906.dev1.dist-info → upgini-1.2.97.dist-info}/METADATA +1 -1
- {upgini-1.2.96a3906.dev1.dist-info → upgini-1.2.97.dist-info}/RECORD +11 -11
- {upgini-1.2.96a3906.dev1.dist-info → upgini-1.2.97.dist-info}/WHEEL +0 -0
- {upgini-1.2.96a3906.dev1.dist-info → upgini-1.2.97.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.96a3906.dev1"
|
1
|
+
__version__ = "1.2.97"
|
@@ -149,9 +149,8 @@ class DataSourcePublisher:
|
|
149
149
|
existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
|
150
150
|
if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
|
151
151
|
existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
|
152
|
-
if (
|
153
|
-
|
154
|
-
or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
|
152
|
+
if existing_search_keys == {v.value.name for v in search_keys.values()} or (
|
153
|
+
"IP" in str(existing_search_keys) and "IP" in str(search_keys.values())
|
155
154
|
):
|
156
155
|
raise ValidationError(
|
157
156
|
"ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
|
@@ -494,3 +493,21 @@ class DataSourcePublisher:
|
|
494
493
|
raise Exception("Failed to reannounce all ADS: " + status_response["errorMessage"])
|
495
494
|
except Exception:
|
496
495
|
self.logger.exception("Failed to reannounce all ADS-es")
|
496
|
+
|
497
|
+
def upload_autofe_model(
|
498
|
+
self, file_path: str, name: str, model_type: Optional[Literal["ONNX"]] = None, description: str = ""
|
499
|
+
):
|
500
|
+
if model_type is not None and model_type not in ["ONNX"]:
|
501
|
+
raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
|
502
|
+
metadata = {
|
503
|
+
"modelName": name,
|
504
|
+
"modelType": model_type or "ONNX",
|
505
|
+
"description": description,
|
506
|
+
}
|
507
|
+
trace_id = str(uuid.uuid4())
|
508
|
+
with MDC(trace_id=trace_id):
|
509
|
+
try:
|
510
|
+
self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
|
511
|
+
except Exception:
|
512
|
+
self.logger.exception("Failed to upload autofe model")
|
513
|
+
raise
|
upgini/features_enricher.py
CHANGED
@@ -71,10 +71,7 @@ from upgini.search_task import SearchTask
|
|
71
71
|
from upgini.spinner import Spinner
|
72
72
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
73
73
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
74
|
-
from upgini.utils.country_utils import (
|
75
|
-
CountrySearchKeyConverter,
|
76
|
-
CountrySearchKeyDetector,
|
77
|
-
)
|
74
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
78
75
|
from upgini.utils.custom_loss_utils import (
|
79
76
|
get_additional_params_custom_loss,
|
80
77
|
get_runtime_params_custom_loss,
|
@@ -105,11 +102,8 @@ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
105
102
|
from upgini.utils.features_validator import FeaturesValidator
|
106
103
|
from upgini.utils.format import Format
|
107
104
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
108
|
-
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
109
|
-
from upgini.utils.postal_code_utils import (
|
110
|
-
PostalCodeSearchKeyConverter,
|
111
|
-
PostalCodeSearchKeyDetector,
|
112
|
-
)
|
105
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
106
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
113
107
|
|
114
108
|
try:
|
115
109
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -1122,6 +1116,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1122
1116
|
# and calculate final metric (and uplift)
|
1123
1117
|
enriched_metric = None
|
1124
1118
|
uplift = None
|
1119
|
+
uplift_perc = None
|
1125
1120
|
enriched_estimator = None
|
1126
1121
|
if set(fitting_X.columns) != set(fitting_enriched_X.columns):
|
1127
1122
|
self.logger.info(
|
@@ -1153,6 +1148,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1153
1148
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
1154
1149
|
if baseline_metric is not None and enriched_metric is not None:
|
1155
1150
|
uplift = (enriched_cv_result.metric - baseline_cv_result.metric) * multiplier
|
1151
|
+
uplift_perc = uplift / abs(baseline_cv_result.metric) * 100
|
1156
1152
|
|
1157
1153
|
train_metrics = {
|
1158
1154
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
@@ -1179,7 +1175,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1179
1175
|
enriched_metric
|
1180
1176
|
)
|
1181
1177
|
if uplift is not None:
|
1182
|
-
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
|
1178
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = round(uplift, 3)
|
1179
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_perc_header")] = (
|
1180
|
+
f"{round(uplift_perc, 1)}%"
|
1181
|
+
)
|
1183
1182
|
metrics = [train_metrics]
|
1184
1183
|
|
1185
1184
|
# 3 If eval_set is presented - fit final model on train enriched data and score each
|
@@ -1228,8 +1227,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1228
1227
|
|
1229
1228
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
1230
1229
|
eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
|
1230
|
+
eval_uplift_perc = eval_uplift / abs(etalon_eval_results.metric) * 100
|
1231
1231
|
else:
|
1232
1232
|
eval_uplift = None
|
1233
|
+
eval_uplift_perc = None
|
1233
1234
|
|
1234
1235
|
eval_metrics = {
|
1235
1236
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
@@ -1260,7 +1261,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1260
1261
|
enriched_eval_metric
|
1261
1262
|
)
|
1262
1263
|
if eval_uplift is not None:
|
1263
|
-
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
1264
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = round(eval_uplift, 3)
|
1265
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_perc_header")] = (
|
1266
|
+
f"{round(eval_uplift_perc, 1)}%"
|
1267
|
+
)
|
1264
1268
|
|
1265
1269
|
metrics.append(eval_metrics)
|
1266
1270
|
|
@@ -2495,21 +2499,6 @@ if response.status_code == 200:
|
|
2495
2499
|
)
|
2496
2500
|
df = converter.convert(df)
|
2497
2501
|
|
2498
|
-
phone_column = self._get_phone_column(search_keys)
|
2499
|
-
country_column = self._get_country_column(search_keys)
|
2500
|
-
if phone_column:
|
2501
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
2502
|
-
df = converter.convert(df)
|
2503
|
-
|
2504
|
-
if country_column:
|
2505
|
-
converter = CountrySearchKeyConverter(country_column)
|
2506
|
-
df = converter.convert(df)
|
2507
|
-
|
2508
|
-
postal_code = self._get_postal_column(search_keys)
|
2509
|
-
if postal_code:
|
2510
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
2511
|
-
df = converter.convert(df)
|
2512
|
-
|
2513
2502
|
meaning_types = {}
|
2514
2503
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2515
2504
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
@@ -2904,6 +2893,7 @@ if response.status_code == 200:
|
|
2904
2893
|
self.fit_generated_features.extend(converter.generated_features)
|
2905
2894
|
else:
|
2906
2895
|
self.logger.info("Input dataset hasn't date column")
|
2896
|
+
# TODO remove when this logic will be implemented on the back
|
2907
2897
|
if self.__should_add_date_column():
|
2908
2898
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
2909
2899
|
|
@@ -2935,6 +2925,26 @@ if response.status_code == 200:
|
|
2935
2925
|
if normalizer.removed_features:
|
2936
2926
|
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
2937
2927
|
|
2928
|
+
non_feature_columns = [
|
2929
|
+
self.TARGET_NAME,
|
2930
|
+
EVAL_SET_INDEX,
|
2931
|
+
] + list(self.fit_search_keys.keys())
|
2932
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2933
|
+
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
2934
|
+
|
2935
|
+
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2936
|
+
|
2937
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2938
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
2939
|
+
)
|
2940
|
+
if feature_validator_warnings:
|
2941
|
+
for warning in feature_validator_warnings:
|
2942
|
+
self.__log_warning(warning)
|
2943
|
+
self.fit_dropped_features.update(features_to_drop)
|
2944
|
+
df = df.drop(columns=features_to_drop)
|
2945
|
+
|
2946
|
+
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
2947
|
+
|
2938
2948
|
self.__adjust_cv(df)
|
2939
2949
|
|
2940
2950
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
@@ -2974,6 +2984,7 @@ if response.status_code == 200:
|
|
2974
2984
|
# Convert EMAIL to HEM etc after unnesting to do it only with one column
|
2975
2985
|
df = self.__convert_unnestable_keys(df, unnest_search_keys)
|
2976
2986
|
|
2987
|
+
# refresh features columns
|
2977
2988
|
non_feature_columns = [
|
2978
2989
|
self.TARGET_NAME,
|
2979
2990
|
EVAL_SET_INDEX,
|
@@ -2985,17 +2996,6 @@ if response.status_code == 200:
|
|
2985
2996
|
|
2986
2997
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2987
2998
|
|
2988
|
-
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2989
|
-
df, features_columns, self.generate_features, self.fit_columns_renaming
|
2990
|
-
)
|
2991
|
-
if feature_validator_warnings:
|
2992
|
-
for warning in feature_validator_warnings:
|
2993
|
-
self.__log_warning(warning)
|
2994
|
-
self.fit_dropped_features.update(features_to_drop)
|
2995
|
-
df = df.drop(columns=features_to_drop)
|
2996
|
-
|
2997
|
-
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
2998
|
-
|
2999
2999
|
meaning_types = {
|
3000
3000
|
**{col: key.value for col, key in self.fit_search_keys.items()},
|
3001
3001
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
@@ -3225,20 +3225,6 @@ if response.status_code == 200:
|
|
3225
3225
|
self.logger,
|
3226
3226
|
)
|
3227
3227
|
df = converter.convert(df)
|
3228
|
-
phone_column = self._get_phone_column(self.fit_search_keys)
|
3229
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
3230
|
-
if phone_column:
|
3231
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
3232
|
-
df = converter.convert(df)
|
3233
|
-
|
3234
|
-
if country_column:
|
3235
|
-
converter = CountrySearchKeyConverter(country_column)
|
3236
|
-
df = converter.convert(df)
|
3237
|
-
|
3238
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
3239
|
-
if postal_code:
|
3240
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
3241
|
-
df = converter.convert(df)
|
3242
3228
|
|
3243
3229
|
return df
|
3244
3230
|
|
@@ -4642,42 +4628,59 @@ if response.status_code == 200:
|
|
4642
4628
|
if isinstance(X_, pd.Series):
|
4643
4629
|
X_ = X_.to_frame()
|
4644
4630
|
|
4645
|
-
# TODO check that this file was already uploaded
|
4646
|
-
|
4647
4631
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
4648
4632
|
X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
|
4633
|
+
x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
|
4634
|
+
if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
|
4635
|
+
self.logger.info(f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping")
|
4636
|
+
else:
|
4637
|
+
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/x.parquet", "x.parquet")
|
4649
4638
|
|
4650
4639
|
if y_ is not None:
|
4651
4640
|
if isinstance(y_, pd.Series):
|
4652
4641
|
y_ = y_.to_frame()
|
4653
4642
|
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4654
|
-
|
4655
|
-
|
4656
|
-
|
4657
|
-
|
4658
|
-
eval_x_ = eval_x_.to_frame()
|
4659
|
-
eval_x_.to_parquet(f"{tmp_dir}/eval_x.parquet", compression="zstd")
|
4660
|
-
if isinstance(eval_y_, pd.Series):
|
4661
|
-
eval_y_ = eval_y_.to_frame()
|
4662
|
-
eval_y_.to_parquet(f"{tmp_dir}/eval_y.parquet", compression="zstd")
|
4663
|
-
self.rest_client.dump_input_files(
|
4664
|
-
trace_id,
|
4665
|
-
f"{tmp_dir}/x.parquet",
|
4666
|
-
f"{tmp_dir}/y.parquet",
|
4667
|
-
f"{tmp_dir}/eval_x.parquet",
|
4668
|
-
f"{tmp_dir}/eval_y.parquet",
|
4643
|
+
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4644
|
+
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4645
|
+
self.logger.info(
|
4646
|
+
f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
|
4669
4647
|
)
|
4670
4648
|
else:
|
4671
|
-
self.rest_client.
|
4672
|
-
|
4673
|
-
|
4674
|
-
|
4675
|
-
|
4676
|
-
|
4677
|
-
|
4678
|
-
|
4679
|
-
|
4680
|
-
|
4649
|
+
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/y.parquet", "y.parquet")
|
4650
|
+
|
4651
|
+
if eval_set_ is not None and len(eval_set_) > 0:
|
4652
|
+
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4653
|
+
if isinstance(eval_x_, pd.Series):
|
4654
|
+
eval_x_ = eval_x_.to_frame()
|
4655
|
+
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4656
|
+
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4657
|
+
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4658
|
+
)
|
4659
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4660
|
+
self.logger.info(
|
4661
|
+
f"File eval_x_{idx}.parquet was already uploaded with"
|
4662
|
+
f" digest {eval_x_digest_sha256}, skipping"
|
4663
|
+
)
|
4664
|
+
else:
|
4665
|
+
self.rest_client.dump_input_file(
|
4666
|
+
trace_id, f"{tmp_dir}/eval_x_{idx}.parquet", f"eval_x_{idx}.parquet"
|
4667
|
+
)
|
4668
|
+
|
4669
|
+
if isinstance(eval_y_, pd.Series):
|
4670
|
+
eval_y_ = eval_y_.to_frame()
|
4671
|
+
eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
|
4672
|
+
eval_y_digest_sha256 = self.rest_client.compute_file_digest(
|
4673
|
+
f"{tmp_dir}/eval_y_{idx}.parquet"
|
4674
|
+
)
|
4675
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
|
4676
|
+
self.logger.info(
|
4677
|
+
f"File eval_y_{idx}.parquet was already uploaded"
|
4678
|
+
f" with digest {eval_y_digest_sha256}, skipping"
|
4679
|
+
)
|
4680
|
+
else:
|
4681
|
+
self.rest_client.dump_input_file(
|
4682
|
+
trace_id, f"{tmp_dir}/eval_y_{idx}.parquet", f"eval_y_{idx}.parquet"
|
4683
|
+
)
|
4681
4684
|
except Exception:
|
4682
4685
|
self.logger.warning("Failed to dump input files", exc_info=True)
|
4683
4686
|
|
upgini/http.py
CHANGED
@@ -12,6 +12,7 @@ from enum import Enum
|
|
12
12
|
from functools import lru_cache
|
13
13
|
from http.client import HTTPConnection
|
14
14
|
from json import dumps
|
15
|
+
from pathlib import Path
|
15
16
|
from typing import Any, Dict, List, Optional, Tuple
|
16
17
|
from urllib.parse import urljoin
|
17
18
|
|
@@ -292,6 +293,7 @@ class _RestClient:
|
|
292
293
|
UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
|
293
294
|
STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"
|
294
295
|
UNION_SEARCH_TASKS_URI_FMT = SERVICE_ROOT_V2 + "search/merge"
|
296
|
+
UPLOAD_AUTOFE_MODEL_URI_FMT = "private/api/v2/autofe/model/upload"
|
295
297
|
|
296
298
|
ACCESS_TOKEN_HEADER_NAME = "Authorization"
|
297
299
|
CONTENT_TYPE_HEADER_NAME = "Content-Type"
|
@@ -404,6 +406,16 @@ class _RestClient:
|
|
404
406
|
meaning_types = [_RestClient.meaning_type_by_name(name, metadata) for name in search_key_names]
|
405
407
|
return [meaning_type.value for meaning_type in meaning_types if meaning_type is not None]
|
406
408
|
|
409
|
+
def dump_input_file(self, trace_id: str, path: str, file_name: str):
|
410
|
+
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
411
|
+
with open(path, "rb") as file:
|
412
|
+
files = {"file": (file_name, file, "application/octet-stream")}
|
413
|
+
self._with_unauth_retry(
|
414
|
+
lambda: self._send_post_file_req_v2(
|
415
|
+
api_path, files, trace_id=trace_id, need_json_response=False
|
416
|
+
)
|
417
|
+
)
|
418
|
+
|
407
419
|
def dump_input_files(
|
408
420
|
self,
|
409
421
|
trace_id: str,
|
@@ -811,6 +823,17 @@ class _RestClient:
|
|
811
823
|
api_path = self.UNION_SEARCH_TASKS_URI_FMT
|
812
824
|
return self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
|
813
825
|
|
826
|
+
def upload_autofe_model(self, file_path: str, metadata: dict, trace_id: str):
|
827
|
+
api_path = self.UPLOAD_AUTOFE_MODEL_URI_FMT
|
828
|
+
with open(file_path, "rb") as file:
|
829
|
+
files = {
|
830
|
+
"meta": ("metadata.json", dumps(metadata).encode(), "application/json"),
|
831
|
+
"model": (Path(file_path).name, file, "application/octet-stream"),
|
832
|
+
}
|
833
|
+
return self._with_unauth_retry(
|
834
|
+
lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
|
835
|
+
)
|
836
|
+
|
814
837
|
# ---
|
815
838
|
|
816
839
|
def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
|
@@ -24,8 +24,11 @@ from upgini.metadata import (
|
|
24
24
|
)
|
25
25
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
27
|
+
from upgini.utils.country_utils import CountrySearchKeyConverter
|
27
28
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
29
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
28
30
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
31
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
|
29
32
|
|
30
33
|
|
31
34
|
class Normalizer:
|
@@ -65,6 +68,12 @@ class Normalizer:
|
|
65
68
|
|
66
69
|
df = self._convert_phone_numbers(df)
|
67
70
|
|
71
|
+
df = self._convert_ip_addresses(df)
|
72
|
+
|
73
|
+
df = self._convert_postal_codes(df)
|
74
|
+
|
75
|
+
df = self._convert_countries(df)
|
76
|
+
|
68
77
|
df = self.__convert_features_types(df)
|
69
78
|
|
70
79
|
return df, self.search_keys, self.generated_features
|
@@ -74,33 +83,19 @@ class Normalizer:
|
|
74
83
|
new_columns = []
|
75
84
|
dup_counter = 0
|
76
85
|
for column in df.columns:
|
77
|
-
if
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
DateTimeSearchKeyConverter.DATETIME_COL,
|
86
|
-
]
|
87
|
-
):
|
86
|
+
if column in [
|
87
|
+
TARGET,
|
88
|
+
EVAL_SET_INDEX,
|
89
|
+
SYSTEM_RECORD_ID,
|
90
|
+
ENTITY_SYSTEM_RECORD_ID,
|
91
|
+
SEARCH_KEY_UNNEST,
|
92
|
+
DateTimeSearchKeyConverter.DATETIME_COL,
|
93
|
+
]:
|
88
94
|
self.columns_renaming[column] = column
|
89
95
|
new_columns.append(column)
|
90
96
|
continue
|
91
97
|
|
92
|
-
new_column = str(column)
|
93
|
-
suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
|
94
|
-
if len(new_column) == 0:
|
95
|
-
raise ValidationError(self.bundle.get("dataset_empty_column_names"))
|
96
|
-
# db limit for column length
|
97
|
-
if len(new_column) > 250:
|
98
|
-
new_column = new_column[:250]
|
99
|
-
|
100
|
-
# make column name unique relative to server features
|
101
|
-
new_column = f"{new_column}_{suffix}"
|
102
|
-
|
103
|
-
new_column = new_column.lower()
|
98
|
+
new_column = add_hash_suffix(column, self.bundle)
|
104
99
|
|
105
100
|
# if column starts with non alphabetic symbol then add "a" to the beginning of string
|
106
101
|
if ord(new_column[0]) not in range(ord("a"), ord("z") + 1):
|
@@ -191,6 +186,22 @@ class Normalizer:
|
|
191
186
|
df = converter.convert(df)
|
192
187
|
return df
|
193
188
|
|
189
|
+
def _convert_ip_addresses(self, df: pd.DataFrame) -> pd.DataFrame:
|
190
|
+
for ip_col in SearchKey.find_all_keys(self.search_keys, SearchKey.IP):
|
191
|
+
df[ip_col] = df[ip_col].apply(IpSearchKeyConverter.safe_ip_parse)
|
192
|
+
return df
|
193
|
+
|
194
|
+
def _convert_postal_codes(self, df: pd.DataFrame) -> pd.DataFrame:
|
195
|
+
for postal_code_col in SearchKey.find_all_keys(self.search_keys, SearchKey.POSTAL_CODE):
|
196
|
+
df = PostalCodeSearchKeyConverter(postal_code_col).convert(df)
|
197
|
+
return df
|
198
|
+
|
199
|
+
def _convert_countries(self, df: pd.DataFrame) -> pd.DataFrame:
|
200
|
+
maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
|
201
|
+
if maybe_country_col:
|
202
|
+
df = CountrySearchKeyConverter(maybe_country_col).convert(df)
|
203
|
+
return df
|
204
|
+
|
194
205
|
def __convert_features_types(self, df: pd.DataFrame):
|
195
206
|
# self.logger.info("Convert features to supported data types")
|
196
207
|
|
@@ -198,3 +209,19 @@ class Normalizer:
|
|
198
209
|
if not is_numeric_dtype(df[f]):
|
199
210
|
df[f] = df[f].astype("string")
|
200
211
|
return df
|
212
|
+
|
213
|
+
|
214
|
+
def add_hash_suffix(column: str, bundle: ResourceBundle | None = None) -> str:
|
215
|
+
new_column = str(column)
|
216
|
+
suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
|
217
|
+
if bundle is not None and len(new_column) == 0:
|
218
|
+
raise ValidationError(bundle.get("dataset_empty_column_names"))
|
219
|
+
# db limit for column length
|
220
|
+
if len(new_column) > 250:
|
221
|
+
new_column = new_column[:250]
|
222
|
+
|
223
|
+
# make column name unique relative to server features
|
224
|
+
new_column = f"{new_column}_{suffix}"
|
225
|
+
|
226
|
+
new_column = new_column.lower()
|
227
|
+
return new_column
|
@@ -284,8 +284,8 @@ quality_metrics_segment_header=Dataset type
|
|
284
284
|
quality_metrics_match_rate_header=Match rate
|
285
285
|
quality_metrics_baseline_header=Baseline {}
|
286
286
|
quality_metrics_enriched_header=Enriched {}
|
287
|
-
quality_metrics_uplift_header=Uplift
|
288
|
-
|
287
|
+
quality_metrics_uplift_header=Uplift, abs
|
288
|
+
quality_metrics_uplift_perc_header=Uplift, %
|
289
289
|
|
290
290
|
# Legacy native api messages
|
291
291
|
dataset_dataframe_or_path_empty=Either `df` or `path` must be provided
|
upgini/utils/ip_utils.py
CHANGED
@@ -79,7 +79,7 @@ class IpSearchKeyConverter:
|
|
79
79
|
pass
|
80
80
|
|
81
81
|
@staticmethod
|
82
|
-
def
|
82
|
+
def safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address, bytes]) -> Optional[_BaseAddress]:
|
83
83
|
try:
|
84
84
|
return ip_address(ip)
|
85
85
|
except ValueError:
|
@@ -110,7 +110,7 @@ class IpSearchKeyConverter:
|
|
110
110
|
self.logger.info("Convert ip address to int")
|
111
111
|
original_ip = self.columns_renaming[self.ip_column]
|
112
112
|
|
113
|
-
df[self.ip_column] = df[self.ip_column].apply(self.
|
113
|
+
df[self.ip_column] = df[self.ip_column].apply(self.safe_ip_parse)
|
114
114
|
if df[self.ip_column].isnull().all():
|
115
115
|
raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
|
116
116
|
|
@@ -1,10 +1,10 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=JMIYICLhPHLeOmqLMrl8GF2-GUkujhXH6zWRyr1_Nyw,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=4rKoV-3jM876Fk0fM4XlnW3fLwXvk1KN2ymcwlAfPm0,219941
|
7
|
+
upgini/http.py,sha256=DNcoS7qdxG0mOJn6I8r6O5I6XdIJTdzDzW3hkz3NgG4,45443
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
9
|
upgini/metrics.py,sha256=UbKEsHB7XDzoyGNqDx846zbh1t65GpqdnnhViccdoKU,45615
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
|
|
31
31
|
upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
|
32
32
|
upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
|
33
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
34
|
+
upgini/data_source/data_source_publisher.py,sha256=iqcDx2oRiVyeuzQqqPBnfqWhzQSXTyya0wk2ltibBAA,25010
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
38
|
+
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=UO6K0wwvutyOyClOnJYlFYAETzMSen6hHnj3--5AIAs,28497
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -59,7 +59,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
59
59
|
upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
62
|
-
upgini/utils/ip_utils.py,sha256=
|
62
|
+
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
63
63
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.97.dist-info/METADATA,sha256=J026R7Cl6GQ2NHGHzjxhrnvDLM_mXQvQy2jqvveKvaI,49528
|
75
|
+
upgini-1.2.97.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.97.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.97.dist-info/RECORD,,
|
File without changes
|
File without changes
|