upgini 1.2.120a1__py3-none-any.whl → 1.2.121a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +6 -3
- upgini/dataset.py +0 -2
- upgini/features_enricher.py +7 -10
- upgini/http.py +1 -11
- upgini/utils/features_validator.py +39 -15
- upgini/utils/sklearn_ext.py +3 -4
- {upgini-1.2.120a1.dist-info → upgini-1.2.121a1.dist-info}/METADATA +1 -1
- {upgini-1.2.120a1.dist-info → upgini-1.2.121a1.dist-info}/RECORD +11 -11
- {upgini-1.2.120a1.dist-info → upgini-1.2.121a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.120a1.dist-info → upgini-1.2.121a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.120a1"
|
1
|
+
__version__ = "1.2.121a1"
|
upgini/data_source/data_source_publisher.py
CHANGED
@@ -519,21 +519,24 @@ class DataSourcePublisher:
|
|
519
519
|
description: str = "",
|
520
520
|
):
|
521
521
|
if model_type is not None and model_type not in ["ONNX", "CATBOOST"]:
|
522
|
-
raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
|
522
|
+
raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX, CATBOOST")
|
523
523
|
metadata = {
|
524
524
|
"modelName": name,
|
525
525
|
"inputNames": input_names,
|
526
526
|
"dateColumn": date_column,
|
527
527
|
"scoreName": score_name,
|
528
528
|
"searchTaskId": search_id,
|
529
|
-
"modelType": model_type or "
|
529
|
+
"modelType": model_type or "CATBOOST",
|
530
530
|
"description": description,
|
531
531
|
}
|
532
532
|
|
533
533
|
trace_id = str(uuid.uuid4())
|
534
534
|
with MDC(trace_id=trace_id):
|
535
535
|
try:
|
536
|
-
self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
|
536
|
+
result = self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
|
537
|
+
if "ERROR" in result:
|
538
|
+
raise Exception(result)
|
539
|
+
print(result)
|
537
540
|
except Exception:
|
538
541
|
self.logger.exception("Failed to upload autofe model")
|
539
542
|
raise
|
upgini/dataset.py
CHANGED
@@ -694,9 +694,7 @@ class Dataset:
|
|
694
694
|
|
695
695
|
def prepare_uploading_file(self, base_path: str) -> str:
|
696
696
|
parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
|
697
|
-
print("Before saving parquet file")
|
698
697
|
self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
|
699
|
-
print("After saving parquet file")
|
700
698
|
uploading_file_size = Path(parquet_file_path).stat().st_size
|
701
699
|
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
|
702
700
|
if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
|
upgini/features_enricher.py
CHANGED
@@ -1028,13 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1028
1028
|
columns_renaming,
|
1029
1029
|
_,
|
1030
1030
|
) = prepared_data
|
1031
|
-
|
1032
|
-
# rename baseline_score_column
|
1033
|
-
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
1034
|
-
baseline_score_column = self.baseline_score_column
|
1035
|
-
if baseline_score_column is not None:
|
1036
|
-
baseline_score_column = reversed_renaming[baseline_score_column]
|
1037
|
-
|
1031
|
+
|
1038
1032
|
gc.collect()
|
1039
1033
|
|
1040
1034
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
@@ -1089,7 +1083,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1089
1083
|
has_time=has_time,
|
1090
1084
|
)
|
1091
1085
|
baseline_cv_result = baseline_estimator.cross_val_predict(
|
1092
|
-
fitting_X, y_sorted, baseline_score_column
|
1086
|
+
fitting_X, y_sorted, self.baseline_score_column
|
1093
1087
|
)
|
1094
1088
|
baseline_metric = baseline_cv_result.get_display_metric()
|
1095
1089
|
if baseline_metric is None:
|
@@ -1192,7 +1186,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1192
1186
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
1193
1187
|
)
|
1194
1188
|
etalon_eval_results = baseline_estimator.calculate_metric(
|
1195
|
-
eval_X_sorted, eval_y_sorted, baseline_score_column
|
1189
|
+
eval_X_sorted, eval_y_sorted, self.baseline_score_column
|
1196
1190
|
)
|
1197
1191
|
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
1198
1192
|
self.logger.info(
|
@@ -2502,6 +2496,9 @@ if response.status_code == 200:
|
|
2502
2496
|
) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
|
2503
2497
|
if self._search_task is None:
|
2504
2498
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
2499
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
2500
|
+
if features_meta is None:
|
2501
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
2505
2502
|
|
2506
2503
|
start_time = time.time()
|
2507
2504
|
search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
|
@@ -2531,7 +2528,6 @@ if response.status_code == 200:
|
|
2531
2528
|
self.__display_support_link(msg)
|
2532
2529
|
return None, {}, [], self.search_keys
|
2533
2530
|
|
2534
|
-
features_meta = self._search_task.get_all_features_metadata_v2()
|
2535
2531
|
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
2536
2532
|
if len(online_api_features) > 0:
|
2537
2533
|
self.logger.warning(
|
@@ -3382,6 +3378,7 @@ if response.status_code == 200:
|
|
3382
3378
|
except KeyboardInterrupt as e:
|
3383
3379
|
print(self.bundle.get("search_stopping"))
|
3384
3380
|
self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
|
3381
|
+
self._search_task = None
|
3385
3382
|
self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
|
3386
3383
|
print(self.bundle.get("search_stopped"))
|
3387
3384
|
raise e
|
upgini/http.py
CHANGED
@@ -426,26 +426,19 @@ class _RestClient:
|
|
426
426
|
) -> SearchTaskResponse:
|
427
427
|
api_path = self.INITIAL_SEARCH_URI_FMT_V2
|
428
428
|
|
429
|
-
print("Before getting track metrics")
|
430
429
|
track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
|
431
|
-
print("After getting track metrics")
|
432
430
|
|
433
431
|
def open_and_send():
|
434
432
|
md5_hash = hashlib.md5()
|
435
|
-
print("Before opening file to calculate hashes")
|
436
433
|
with open(file_path, "rb") as file:
|
437
434
|
content = file.read()
|
438
435
|
md5_hash.update(content)
|
439
436
|
digest = md5_hash.hexdigest()
|
440
437
|
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
|
441
|
-
print("After calculating md5")
|
442
438
|
|
443
|
-
print("Before calculating sha256")
|
444
439
|
digest_sha256 = file_hash(file_path)
|
445
|
-
print("After calculating sha256")
|
446
440
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
447
441
|
|
448
|
-
print("Before opening file to send")
|
449
442
|
with open(file_path, "rb") as file:
|
450
443
|
files = {
|
451
444
|
"metadata": (
|
@@ -473,12 +466,9 @@ class _RestClient:
|
|
473
466
|
)
|
474
467
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
475
468
|
|
476
|
-
|
477
|
-
response = self._send_post_file_req_v2(
|
469
|
+
return self._send_post_file_req_v2(
|
478
470
|
api_path, files, trace_id=trace_id, additional_headers=additional_headers
|
479
471
|
)
|
480
|
-
print("After sending request")
|
481
|
-
return response
|
482
472
|
|
483
473
|
response = self._with_unauth_retry(open_and_send)
|
484
474
|
return SearchTaskResponse(response)
|
upgini/utils/features_validator.py
CHANGED
@@ -24,7 +24,7 @@ class FeaturesValidator:
|
|
24
24
|
features_for_generate: Optional[List[str]] = None,
|
25
25
|
columns_renaming: Optional[Dict[str, str]] = None,
|
26
26
|
) -> Tuple[List[str], List[str]]:
|
27
|
-
|
27
|
+
one_hot_encoded_features = []
|
28
28
|
empty_or_constant_features = []
|
29
29
|
high_cardinality_features = []
|
30
30
|
warnings = []
|
@@ -39,20 +39,17 @@ class FeaturesValidator:
|
|
39
39
|
if most_frequent_percent >= 0.99:
|
40
40
|
empty_or_constant_features.append(f)
|
41
41
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
# if one_hot_encoded_features:
|
54
|
-
# msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
55
|
-
# warnings.append(msg)
|
42
|
+
if len(value_counts) == 1:
|
43
|
+
empty_or_constant_features.append(f)
|
44
|
+
elif most_frequent_percent >= 0.99:
|
45
|
+
if self.is_one_hot_encoded(column):
|
46
|
+
one_hot_encoded_features.append(f)
|
47
|
+
else:
|
48
|
+
empty_or_constant_features.append(f)
|
49
|
+
|
50
|
+
if one_hot_encoded_features:
|
51
|
+
msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
52
|
+
self.logger.info(msg)
|
56
53
|
|
57
54
|
columns_renaming = columns_renaming or {}
|
58
55
|
|
@@ -102,3 +99,30 @@ class FeaturesValidator:
|
|
102
99
|
@staticmethod
|
103
100
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
104
101
|
return [i for i in df if df[i].nunique() <= 1]
|
102
|
+
|
103
|
+
@staticmethod
|
104
|
+
def is_one_hot_encoded(series: pd.Series) -> bool:
|
105
|
+
try:
|
106
|
+
# Column contains only 0 and 1 (as strings or numbers)
|
107
|
+
series = series.astype(float)
|
108
|
+
if set(series.unique()) != {0.0, 1.0}:
|
109
|
+
return False
|
110
|
+
|
111
|
+
series = series.astype(int)
|
112
|
+
|
113
|
+
# Column doesn't contain any NaN, np.NaN, space, null, etc.
|
114
|
+
if not (series.isin([0, 1])).all():
|
115
|
+
return False
|
116
|
+
|
117
|
+
vc = series.value_counts()
|
118
|
+
# Column should contain both 0 and 1
|
119
|
+
if len(vc) != 2:
|
120
|
+
return False
|
121
|
+
|
122
|
+
# Minority class is 1
|
123
|
+
if vc[1] >= vc[0]:
|
124
|
+
return False
|
125
|
+
|
126
|
+
return True
|
127
|
+
except ValueError:
|
128
|
+
return False
|
upgini/utils/sklearn_ext.py
CHANGED
@@ -1301,6 +1301,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
|
|
1301
1301
|
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
1302
1302
|
encoder.fit(X_train[cat_features], y_train)
|
1303
1303
|
|
1304
|
+
# OrdinalEncoder doesn't support progressive encoding with target
|
1304
1305
|
X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
|
1305
1306
|
X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
|
1306
1307
|
|
@@ -1314,10 +1315,8 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
|
|
1314
1315
|
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
1315
1316
|
encoder.fit(X_train[cat_features], y_train)
|
1316
1317
|
|
1317
|
-
#
|
1318
|
-
X_train[cat_features] = encoder.transform(X_train[cat_features]
|
1319
|
-
|
1320
|
-
# Static encoding on validation (no y)
|
1318
|
+
# OrdinalEncoder doesn't support progressive encoding with target
|
1319
|
+
X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
|
1321
1320
|
X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
|
1322
1321
|
|
1323
1322
|
return X_train, y_train, X_test, y_test, [], encoder
|
{upgini-1.2.120a1.dist-info → upgini-1.2.121a1.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=lbqEfhDGdLuugmia7aJpwXt4xpDEZT5h_07_bMMutgk,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256
|
6
|
+
upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
|
7
|
+
upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
10
10
|
upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
|
@@ -31,7 +31,7 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
|
|
31
31
|
upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
|
32
32
|
upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
|
33
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
34
|
+
upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbxtSDV94OCM3eqUU,26653
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
59
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
60
60
|
upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
|
61
|
-
upgini/utils/features_validator.py,sha256=
|
61
|
+
upgini/utils/features_validator.py,sha256=wkPQlQFK6EQdnOd1MxFCSmb8gEqzCYJX1isLPaeRsgU,4365
|
62
62
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
63
63
|
upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
@@ -68,13 +68,13 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
69
69
|
upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
71
|
-
upgini/utils/sklearn_ext.py,sha256=
|
71
|
+
upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
|
72
72
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
73
73
|
upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.120a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
-
upgini-1.2.120a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
-
upgini-1.2.120a1.dist-info/RECORD,,
|
77
|
+
upgini-1.2.121a1.dist-info/METADATA,sha256=8lCLPlcxApmxxhl8DgplSrHe_Z_GHqIiOLB66OCabPo,50745
|
78
|
+
upgini-1.2.121a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.121a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.121a1.dist-info/RECORD,,
|
File without changes
|
File without changes
|