upgini 1.2.44__py3-none-any.whl → 1.2.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +4 -4
- upgini/features_enricher.py +31 -14
- {upgini-1.2.44.dist-info → upgini-1.2.46.dist-info}/METADATA +1 -1
- {upgini-1.2.44.dist-info → upgini-1.2.46.dist-info}/RECORD +7 -7
- {upgini-1.2.44.dist-info → upgini-1.2.46.dist-info}/WHEEL +1 -1
- {upgini-1.2.44.dist-info → upgini-1.2.46.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.44"
|
|
1
|
+
__version__ = "1.2.46"
|
upgini/dataset.py
CHANGED
|
@@ -33,7 +33,6 @@ from upgini.metadata import (
|
|
|
33
33
|
NumericInterval,
|
|
34
34
|
RuntimeParameters,
|
|
35
35
|
SearchCustomization,
|
|
36
|
-
SearchKey,
|
|
37
36
|
)
|
|
38
37
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
39
38
|
from upgini.search_task import SearchTask
|
|
@@ -646,8 +645,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
646
645
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
647
646
|
parquet_file_path = self.prepare_uploading_file(tmp_dir)
|
|
648
647
|
time.sleep(1) # this is neccesary to avoid requests rate limit restrictions
|
|
649
|
-
|
|
650
|
-
|
|
648
|
+
# If previous steps were too fast, time estimation could be calculated incorrectly
|
|
649
|
+
time_left = max(time.time() - start_time, 20.0)
|
|
650
|
+
search_progress = SearchProgress(1.0, ProgressStage.CREATING_FIT, time_left)
|
|
651
651
|
if progress_bar is not None:
|
|
652
652
|
progress_bar.progress = search_progress.to_progress_bar()
|
|
653
653
|
if progress_callback is not None:
|
|
@@ -699,7 +699,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
699
699
|
runtime_parameters=runtime_parameters,
|
|
700
700
|
metrics_calculation=metrics_calculation,
|
|
701
701
|
)
|
|
702
|
-
seconds_left = time.time() - start_time
|
|
702
|
+
seconds_left = max(time.time() - start_time, 20.0)
|
|
703
703
|
search_progress = SearchProgress(1.0, ProgressStage.CREATING_TRANSFORM, seconds_left)
|
|
704
704
|
if progress_bar is not None:
|
|
705
705
|
progress_bar.progress = search_progress.to_progress_bar()
|
upgini/features_enricher.py
CHANGED
|
@@ -165,10 +165,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
165
165
|
|
|
166
166
|
shared_datasets: list of str, optional (default=None)
|
|
167
167
|
List of private shared dataset ids for custom search
|
|
168
|
-
|
|
169
|
-
select_features: bool, optional (default=False)
|
|
170
|
-
If True, return only selected features both from input and data sources.
|
|
171
|
-
Otherwise, return all features from input and only selected features from data sources.
|
|
172
168
|
"""
|
|
173
169
|
|
|
174
170
|
TARGET_NAME = "target"
|
|
@@ -235,7 +231,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
235
231
|
client_visitorid: Optional[str] = None,
|
|
236
232
|
custom_bundle_config: Optional[str] = None,
|
|
237
233
|
add_date_if_missing: bool = True,
|
|
238
|
-
select_features: bool = False,
|
|
239
234
|
disable_force_downsampling: bool = False,
|
|
240
235
|
id_columns: Optional[List[str]] = None,
|
|
241
236
|
**kwargs,
|
|
@@ -273,6 +268,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
273
268
|
self.eval_set: Optional[List[Tuple]] = None
|
|
274
269
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
275
270
|
self.imbalanced = False
|
|
271
|
+
self.fit_select_features = False
|
|
276
272
|
self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
|
|
277
273
|
dict()
|
|
278
274
|
)
|
|
@@ -297,7 +293,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
297
293
|
self.dropped_client_feature_names_ = []
|
|
298
294
|
self.feature_importances_ = []
|
|
299
295
|
self.search_id = search_id
|
|
300
|
-
self.select_features = select_features
|
|
301
296
|
self.disable_force_downsampling = disable_force_downsampling
|
|
302
297
|
|
|
303
298
|
if search_id:
|
|
@@ -405,6 +400,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
405
400
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
406
401
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
407
402
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
|
403
|
+
select_features: bool = False,
|
|
408
404
|
**kwargs,
|
|
409
405
|
):
|
|
410
406
|
"""Fit to data.
|
|
@@ -440,6 +436,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
440
436
|
|
|
441
437
|
remove_outliers_calc_metrics, optional (default=True)
|
|
442
438
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
439
|
+
|
|
440
|
+
select_features: bool, optional (default=False)
|
|
441
|
+
If True, return only selected features both from input and data sources.
|
|
442
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
443
443
|
"""
|
|
444
444
|
trace_id = str(uuid.uuid4())
|
|
445
445
|
start_time = time.time()
|
|
@@ -474,6 +474,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
474
474
|
self.y = y
|
|
475
475
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
476
476
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
477
|
+
self.__set_select_features(select_features)
|
|
477
478
|
self.__inner_fit(
|
|
478
479
|
trace_id,
|
|
479
480
|
X,
|
|
@@ -523,6 +524,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
523
524
|
finally:
|
|
524
525
|
self.logger.info(f"Fit elapsed time: {time.time() - start_time}")
|
|
525
526
|
|
|
527
|
+
def __set_select_features(self, select_features: bool):
|
|
528
|
+
self.fit_select_features = select_features
|
|
529
|
+
self.runtime_parameters.properties["select_features"] = select_features
|
|
530
|
+
|
|
526
531
|
def fit_transform(
|
|
527
532
|
self,
|
|
528
533
|
X: Union[pd.DataFrame, pd.Series, np.ndarray],
|
|
@@ -538,6 +543,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
538
543
|
estimator: Optional[Any] = None,
|
|
539
544
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
540
545
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
546
|
+
select_features: bool = False,
|
|
541
547
|
**kwargs,
|
|
542
548
|
) -> pd.DataFrame:
|
|
543
549
|
"""Fit to data, then transform it.
|
|
@@ -578,6 +584,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
578
584
|
remove_outliers_calc_metrics, optional (default=True)
|
|
579
585
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
580
586
|
|
|
587
|
+
select_features: bool, optional (default=False)
|
|
588
|
+
If True, return only selected features both from input and data sources.
|
|
589
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
590
|
+
|
|
581
591
|
Returns
|
|
582
592
|
-------
|
|
583
593
|
X_new: pandas.DataFrame of shape (n_samples, n_features_new)
|
|
@@ -613,6 +623,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
613
623
|
self.X = X
|
|
614
624
|
self.y = y
|
|
615
625
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
626
|
+
self.__set_select_features(select_features)
|
|
616
627
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
617
628
|
|
|
618
629
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
@@ -1096,7 +1107,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1096
1107
|
):
|
|
1097
1108
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1098
1109
|
# np.mean(validated_y), 4
|
|
1099
|
-
np.mean(y_sorted),
|
|
1110
|
+
np.mean(y_sorted),
|
|
1111
|
+
4,
|
|
1100
1112
|
)
|
|
1101
1113
|
if etalon_metric is not None:
|
|
1102
1114
|
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
@@ -1174,7 +1186,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1174
1186
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1175
1187
|
# np.mean(validated_eval_set[idx][1]), 4
|
|
1176
1188
|
# Use actually used for metrics dataset
|
|
1177
|
-
np.mean(eval_y_sorted),
|
|
1189
|
+
np.mean(eval_y_sorted),
|
|
1190
|
+
4,
|
|
1178
1191
|
)
|
|
1179
1192
|
if etalon_eval_metric is not None:
|
|
1180
1193
|
eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
|
@@ -1238,8 +1251,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1238
1251
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1239
1252
|
|
|
1240
1253
|
def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
|
|
1254
|
+
renaming = self.fit_columns_renaming or {}
|
|
1241
1255
|
new_shaps = {
|
|
1242
|
-
feature: _round_shap_value(shap)
|
|
1256
|
+
renaming.get(feature, feature): _round_shap_value(shap)
|
|
1257
|
+
for feature, shap in new_shaps.items()
|
|
1258
|
+
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1243
1259
|
}
|
|
1244
1260
|
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1245
1261
|
|
|
@@ -1458,7 +1474,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1458
1474
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1459
1475
|
excluded = set()
|
|
1460
1476
|
for sk in excluding_search_keys:
|
|
1461
|
-
|
|
1477
|
+
renamed_sk = columns_renaming.get(sk)
|
|
1478
|
+
if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
|
|
1462
1479
|
excluded.add(sk)
|
|
1463
1480
|
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
|
|
1464
1481
|
|
|
@@ -1468,7 +1485,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1468
1485
|
c
|
|
1469
1486
|
for c in X_sampled.columns.to_list()
|
|
1470
1487
|
if (
|
|
1471
|
-
not self.select_features
|
|
1488
|
+
not self.fit_select_features
|
|
1472
1489
|
or c in self.feature_names_
|
|
1473
1490
|
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1474
1491
|
)
|
|
@@ -3315,8 +3332,8 @@ if response.status_code == 200:
|
|
|
3315
3332
|
f"Client ip: {self.client_ip}\n"
|
|
3316
3333
|
f"Client visitorId: {self.client_visitorid}\n"
|
|
3317
3334
|
f"Add date if missing: {self.add_date_if_missing}\n"
|
|
3318
|
-
f"Select features: {self.select_features}\n"
|
|
3319
3335
|
f"Disable force downsampling: {self.disable_force_downsampling}\n"
|
|
3336
|
+
f"Id columns: {self.id_columns}\n"
|
|
3320
3337
|
)
|
|
3321
3338
|
|
|
3322
3339
|
def sample(df):
|
|
@@ -3703,7 +3720,7 @@ if response.status_code == 200:
|
|
|
3703
3720
|
is_client_feature = feature_meta.name in x_columns
|
|
3704
3721
|
|
|
3705
3722
|
if feature_meta.shap_value == 0.0:
|
|
3706
|
-
if self.select_features:
|
|
3723
|
+
if self.fit_select_features:
|
|
3707
3724
|
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3708
3725
|
continue
|
|
3709
3726
|
|
|
@@ -3712,7 +3729,7 @@ if response.status_code == 200:
|
|
|
3712
3729
|
feature_meta.name in self.fit_generated_features
|
|
3713
3730
|
or feature_meta.name == COUNTRY
|
|
3714
3731
|
# In select_features mode we select also from etalon features and need to show them
|
|
3715
|
-
or (not self.select_features and is_client_feature)
|
|
3732
|
+
or (not self.fit_select_features and is_client_feature)
|
|
3716
3733
|
):
|
|
3717
3734
|
continue
|
|
3718
3735
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=s7kBVARFz8lSmh7ulygN5xbxPrps18XAib1Arlvg6cw,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=QC3jncWS3wHe4CY7pWWDMO_3HKxGbi0EyPHXMdBtoQM,33456
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=NWYNZtSgAR05zOZp_Wq1ltVGThCttTbVN_TP2RaWFSI,200008
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.44.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
-
upgini-1.2.44.dist-info/RECORD,,
|
|
62
|
+
upgini-1.2.46.dist-info/METADATA,sha256=79LcIjwGCdKUlidUqvLOy4YnlBIJEyJwLE5tAPxlKo8,49055
|
|
63
|
+
upgini-1.2.46.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.46.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.46.dist-info/RECORD,,
|
|
File without changes
|