upgini 1.2.45__py3-none-any.whl → 1.2.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +2 -2
- upgini/features_enricher.py +31 -14
- upgini/http.py +19 -7
- {upgini-1.2.45.dist-info → upgini-1.2.47.dist-info}/METADATA +1 -1
- {upgini-1.2.45.dist-info → upgini-1.2.47.dist-info}/RECORD +8 -8
- {upgini-1.2.45.dist-info → upgini-1.2.47.dist-info}/WHEEL +0 -0
- {upgini-1.2.45.dist-info → upgini-1.2.47.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.45"
|
|
1
|
+
__version__ = "1.2.47"
|
upgini/dataset.py
CHANGED
|
@@ -646,7 +646,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
646
646
|
parquet_file_path = self.prepare_uploading_file(tmp_dir)
|
|
647
647
|
time.sleep(1) # this is neccesary to avoid requests rate limit restrictions
|
|
648
648
|
# If previous steps were too fast, time estimation could be calculated incorrectly
|
|
649
|
-
time_left = max(time.time() - start_time, 20)
|
|
649
|
+
time_left = max(time.time() - start_time, 20.0)
|
|
650
650
|
search_progress = SearchProgress(1.0, ProgressStage.CREATING_FIT, time_left)
|
|
651
651
|
if progress_bar is not None:
|
|
652
652
|
progress_bar.progress = search_progress.to_progress_bar()
|
|
@@ -699,7 +699,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
699
699
|
runtime_parameters=runtime_parameters,
|
|
700
700
|
metrics_calculation=metrics_calculation,
|
|
701
701
|
)
|
|
702
|
-
seconds_left = max(time.time() - start_time, 20)
|
|
702
|
+
seconds_left = max(time.time() - start_time, 20.0)
|
|
703
703
|
search_progress = SearchProgress(1.0, ProgressStage.CREATING_TRANSFORM, seconds_left)
|
|
704
704
|
if progress_bar is not None:
|
|
705
705
|
progress_bar.progress = search_progress.to_progress_bar()
|
upgini/features_enricher.py
CHANGED
|
@@ -165,10 +165,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
165
165
|
|
|
166
166
|
shared_datasets: list of str, optional (default=None)
|
|
167
167
|
List of private shared dataset ids for custom search
|
|
168
|
-
|
|
169
|
-
select_features: bool, optional (default=False)
|
|
170
|
-
If True, return only selected features both from input and data sources.
|
|
171
|
-
Otherwise, return all features from input and only selected features from data sources.
|
|
172
168
|
"""
|
|
173
169
|
|
|
174
170
|
TARGET_NAME = "target"
|
|
@@ -235,7 +231,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
235
231
|
client_visitorid: Optional[str] = None,
|
|
236
232
|
custom_bundle_config: Optional[str] = None,
|
|
237
233
|
add_date_if_missing: bool = True,
|
|
238
|
-
select_features: bool = False,
|
|
239
234
|
disable_force_downsampling: bool = False,
|
|
240
235
|
id_columns: Optional[List[str]] = None,
|
|
241
236
|
**kwargs,
|
|
@@ -273,6 +268,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
273
268
|
self.eval_set: Optional[List[Tuple]] = None
|
|
274
269
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
275
270
|
self.imbalanced = False
|
|
271
|
+
self.fit_select_features = False
|
|
276
272
|
self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
|
|
277
273
|
dict()
|
|
278
274
|
)
|
|
@@ -297,7 +293,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
297
293
|
self.dropped_client_feature_names_ = []
|
|
298
294
|
self.feature_importances_ = []
|
|
299
295
|
self.search_id = search_id
|
|
300
|
-
self.select_features = select_features
|
|
301
296
|
self.disable_force_downsampling = disable_force_downsampling
|
|
302
297
|
|
|
303
298
|
if search_id:
|
|
@@ -405,6 +400,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
405
400
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
406
401
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
407
402
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
|
403
|
+
select_features: bool = False,
|
|
408
404
|
**kwargs,
|
|
409
405
|
):
|
|
410
406
|
"""Fit to data.
|
|
@@ -440,6 +436,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
440
436
|
|
|
441
437
|
remove_outliers_calc_metrics, optional (default=True)
|
|
442
438
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
439
|
+
|
|
440
|
+
select_features: bool, optional (default=False)
|
|
441
|
+
If True, return only selected features both from input and data sources.
|
|
442
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
443
443
|
"""
|
|
444
444
|
trace_id = str(uuid.uuid4())
|
|
445
445
|
start_time = time.time()
|
|
@@ -474,6 +474,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
474
474
|
self.y = y
|
|
475
475
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
476
476
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
477
|
+
self.__set_select_features(select_features)
|
|
477
478
|
self.__inner_fit(
|
|
478
479
|
trace_id,
|
|
479
480
|
X,
|
|
@@ -523,6 +524,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
523
524
|
finally:
|
|
524
525
|
self.logger.info(f"Fit elapsed time: {time.time() - start_time}")
|
|
525
526
|
|
|
527
|
+
def __set_select_features(self, select_features: bool):
|
|
528
|
+
self.fit_select_features = select_features
|
|
529
|
+
self.runtime_parameters.properties["select_features"] = select_features
|
|
530
|
+
|
|
526
531
|
def fit_transform(
|
|
527
532
|
self,
|
|
528
533
|
X: Union[pd.DataFrame, pd.Series, np.ndarray],
|
|
@@ -538,6 +543,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
538
543
|
estimator: Optional[Any] = None,
|
|
539
544
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
540
545
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
546
|
+
select_features: bool = False,
|
|
541
547
|
**kwargs,
|
|
542
548
|
) -> pd.DataFrame:
|
|
543
549
|
"""Fit to data, then transform it.
|
|
@@ -578,6 +584,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
578
584
|
remove_outliers_calc_metrics, optional (default=True)
|
|
579
585
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
580
586
|
|
|
587
|
+
select_features: bool, optional (default=False)
|
|
588
|
+
If True, return only selected features both from input and data sources.
|
|
589
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
590
|
+
|
|
581
591
|
Returns
|
|
582
592
|
-------
|
|
583
593
|
X_new: pandas.DataFrame of shape (n_samples, n_features_new)
|
|
@@ -613,6 +623,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
613
623
|
self.X = X
|
|
614
624
|
self.y = y
|
|
615
625
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
626
|
+
self.__set_select_features(select_features)
|
|
616
627
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
617
628
|
|
|
618
629
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
@@ -1096,7 +1107,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1096
1107
|
):
|
|
1097
1108
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1098
1109
|
# np.mean(validated_y), 4
|
|
1099
|
-
np.mean(y_sorted),
|
|
1110
|
+
np.mean(y_sorted),
|
|
1111
|
+
4,
|
|
1100
1112
|
)
|
|
1101
1113
|
if etalon_metric is not None:
|
|
1102
1114
|
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
@@ -1174,7 +1186,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1174
1186
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1175
1187
|
# np.mean(validated_eval_set[idx][1]), 4
|
|
1176
1188
|
# Use actually used for metrics dataset
|
|
1177
|
-
np.mean(eval_y_sorted),
|
|
1189
|
+
np.mean(eval_y_sorted),
|
|
1190
|
+
4,
|
|
1178
1191
|
)
|
|
1179
1192
|
if etalon_eval_metric is not None:
|
|
1180
1193
|
eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
|
@@ -1238,8 +1251,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1238
1251
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1239
1252
|
|
|
1240
1253
|
def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
|
|
1254
|
+
renaming = self.fit_columns_renaming or {}
|
|
1241
1255
|
new_shaps = {
|
|
1242
|
-
feature: _round_shap_value(shap)
|
|
1256
|
+
renaming.get(feature, feature): _round_shap_value(shap)
|
|
1257
|
+
for feature, shap in new_shaps.items()
|
|
1258
|
+
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1243
1259
|
}
|
|
1244
1260
|
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1245
1261
|
|
|
@@ -1458,7 +1474,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1458
1474
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1459
1475
|
excluded = set()
|
|
1460
1476
|
for sk in excluding_search_keys:
|
|
1461
|
-
|
|
1477
|
+
renamed_sk = columns_renaming.get(sk)
|
|
1478
|
+
if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
|
|
1462
1479
|
excluded.add(sk)
|
|
1463
1480
|
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
|
|
1464
1481
|
|
|
@@ -1468,7 +1485,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1468
1485
|
c
|
|
1469
1486
|
for c in X_sampled.columns.to_list()
|
|
1470
1487
|
if (
|
|
1471
|
-
not self.select_features
|
|
1488
|
+
not self.fit_select_features
|
|
1472
1489
|
or c in self.feature_names_
|
|
1473
1490
|
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1474
1491
|
)
|
|
@@ -3315,8 +3332,8 @@ if response.status_code == 200:
|
|
|
3315
3332
|
f"Client ip: {self.client_ip}\n"
|
|
3316
3333
|
f"Client visitorId: {self.client_visitorid}\n"
|
|
3317
3334
|
f"Add date if missing: {self.add_date_if_missing}\n"
|
|
3318
|
-
f"Select features: {self.select_features}\n"
|
|
3319
3335
|
f"Disable force downsampling: {self.disable_force_downsampling}\n"
|
|
3336
|
+
f"Id columns: {self.id_columns}\n"
|
|
3320
3337
|
)
|
|
3321
3338
|
|
|
3322
3339
|
def sample(df):
|
|
@@ -3703,7 +3720,7 @@ if response.status_code == 200:
|
|
|
3703
3720
|
is_client_feature = feature_meta.name in x_columns
|
|
3704
3721
|
|
|
3705
3722
|
if feature_meta.shap_value == 0.0:
|
|
3706
|
-
if self.select_features:
|
|
3723
|
+
if self.fit_select_features:
|
|
3707
3724
|
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3708
3725
|
continue
|
|
3709
3726
|
|
|
@@ -3712,7 +3729,7 @@ if response.status_code == 200:
|
|
|
3712
3729
|
feature_meta.name in self.fit_generated_features
|
|
3713
3730
|
or feature_meta.name == COUNTRY
|
|
3714
3731
|
# In select_features mode we select also from etalon features and need to show them
|
|
3715
|
-
or (not self.select_features and is_client_feature)
|
|
3732
|
+
or (not self.fit_select_features and is_client_feature)
|
|
3716
3733
|
):
|
|
3717
3734
|
continue
|
|
3718
3735
|
|
upgini/http.py
CHANGED
|
@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
16
16
|
from urllib.parse import urljoin
|
|
17
17
|
|
|
18
18
|
import jwt
|
|
19
|
-
import pandas as pd
|
|
19
|
+
# import pandas as pd
|
|
20
20
|
import requests
|
|
21
21
|
from pydantic import BaseModel
|
|
22
22
|
from pythonjsonlogger import jsonlogger
|
|
@@ -422,6 +422,16 @@ class _RestClient:
|
|
|
422
422
|
lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
|
|
423
423
|
)
|
|
424
424
|
|
|
425
|
+
@staticmethod
|
|
426
|
+
def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
|
|
427
|
+
hash_func = getattr(hashlib, algorithm)()
|
|
428
|
+
|
|
429
|
+
with open(filepath, "rb") as f:
|
|
430
|
+
for chunk in iter(lambda: f.read(chunk_size), b""):
|
|
431
|
+
hash_func.update(chunk)
|
|
432
|
+
|
|
433
|
+
return hash_func.hexdigest()
|
|
434
|
+
|
|
425
435
|
def initial_search_v2(
|
|
426
436
|
self,
|
|
427
437
|
trace_id: str,
|
|
@@ -442,9 +452,10 @@ class _RestClient:
|
|
|
442
452
|
digest = md5_hash.hexdigest()
|
|
443
453
|
metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
|
|
444
454
|
|
|
445
|
-
digest_sha256 = hashlib.sha256(
|
|
446
|
-
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
447
|
-
).hexdigest()
|
|
455
|
+
# digest_sha256 = hashlib.sha256(
|
|
456
|
+
# pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
457
|
+
# ).hexdigest()
|
|
458
|
+
digest_sha256 = self.compute_file_digest(file_path)
|
|
448
459
|
metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
|
|
449
460
|
|
|
450
461
|
with open(file_path, "rb") as file:
|
|
@@ -530,9 +541,10 @@ class _RestClient:
|
|
|
530
541
|
digest = md5_hash.hexdigest()
|
|
531
542
|
metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
|
|
532
543
|
|
|
533
|
-
digest_sha256 = hashlib.sha256(
|
|
534
|
-
pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
535
|
-
).hexdigest()
|
|
544
|
+
# digest_sha256 = hashlib.sha256(
|
|
545
|
+
# pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
|
|
546
|
+
# ).hexdigest()
|
|
547
|
+
digest_sha256 = self.compute_file_digest(file_path)
|
|
536
548
|
metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
|
|
537
549
|
|
|
538
550
|
with open(file_path, "rb") as file:
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=o2NRe9gScRz1I1oB_R5MjkQ4w7BrDovQP2Z_Mq2c6bo,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=QC3jncWS3wHe4CY7pWWDMO_3HKxGbi0EyPHXMdBtoQM,33456
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=NWYNZtSgAR05zOZp_Wq1ltVGThCttTbVN_TP2RaWFSI,200008
|
|
7
|
+
upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.47.dist-info/METADATA,sha256=4pKaboM3TxupdS6iw1Uh_IW9Dw0X88LnDh1pGjsc3fs,49055
|
|
63
|
+
upgini-1.2.47.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.47.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.47.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|