upgini 1.2.114a5__py3-none-any.whl → 1.2.116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +217 -207
- upgini/http.py +2 -35
- upgini/metrics.py +3 -3
- {upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/METADATA +32 -2
- {upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/RECORD +8 -8
- {upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/WHEEL +0 -0
- {upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.114a5"
|
1
|
+
__version__ = "1.2.116"
|
upgini/features_enricher.py
CHANGED
@@ -12,7 +12,7 @@ from collections import Counter
|
|
12
12
|
from copy import deepcopy
|
13
13
|
from dataclasses import dataclass
|
14
14
|
from threading import Thread
|
15
|
-
from typing import Any, Callable
|
15
|
+
from typing import Any, Callable
|
16
16
|
|
17
17
|
import numpy as np
|
18
18
|
import pandas as pd
|
@@ -207,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
207
207
|
|
208
208
|
def __init__(
|
209
209
|
self,
|
210
|
-
search_keys:
|
211
|
-
country_code:
|
212
|
-
model_task_type:
|
213
|
-
api_key:
|
214
|
-
endpoint:
|
215
|
-
search_id:
|
216
|
-
shared_datasets:
|
217
|
-
runtime_parameters:
|
218
|
-
date_format:
|
210
|
+
search_keys: dict[str, SearchKey] | None = None,
|
211
|
+
country_code: str | None = None,
|
212
|
+
model_task_type: ModelTaskType | str | None = None,
|
213
|
+
api_key: str | None = None,
|
214
|
+
endpoint: str | None = None,
|
215
|
+
search_id: str | None = None,
|
216
|
+
shared_datasets: list[str] | None = None,
|
217
|
+
runtime_parameters: RuntimeParameters | None = None,
|
218
|
+
date_format: str | None = None,
|
219
219
|
random_state: int = 42,
|
220
|
-
cv:
|
221
|
-
loss:
|
220
|
+
cv: CVType | None = None,
|
221
|
+
loss: str | None = None,
|
222
222
|
autodetect_search_keys: bool = True,
|
223
|
-
generate_features:
|
224
|
-
columns_for_online_api:
|
225
|
-
round_embeddings:
|
223
|
+
generate_features: list[str] | None = None,
|
224
|
+
columns_for_online_api: list[str] | None = None,
|
225
|
+
round_embeddings: int | None = None,
|
226
226
|
logs_enabled: bool = True,
|
227
227
|
raise_validation_error: bool = True,
|
228
|
-
exclude_columns:
|
229
|
-
baseline_score_column:
|
230
|
-
client_ip:
|
231
|
-
client_visitorid:
|
232
|
-
custom_bundle_config:
|
228
|
+
exclude_columns: list[str] | None = None,
|
229
|
+
baseline_score_column: Any | None = None,
|
230
|
+
client_ip: str | None = None,
|
231
|
+
client_visitorid: str | None = None,
|
232
|
+
custom_bundle_config: str | None = None,
|
233
233
|
add_date_if_missing: bool = True,
|
234
234
|
disable_force_downsampling: bool = False,
|
235
|
-
id_columns:
|
235
|
+
id_columns: list[str] | None = None,
|
236
236
|
generate_search_key_features: bool = True,
|
237
|
-
sample_config:
|
237
|
+
sample_config: SampleConfig | None = None,
|
238
238
|
print_trace_id: bool = False,
|
239
239
|
**kwargs,
|
240
240
|
):
|
@@ -259,16 +259,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
259
259
|
print(msg)
|
260
260
|
|
261
261
|
self.passed_features: list[str] = []
|
262
|
-
self.df_with_original_index:
|
263
|
-
self.fit_columns_renaming:
|
262
|
+
self.df_with_original_index: pd.DataFrame | None = None
|
263
|
+
self.fit_columns_renaming: dict[str, str] | None = None
|
264
264
|
self.country_added = False
|
265
265
|
self.fit_generated_features: list[str] = []
|
266
|
-
self.fit_dropped_features:
|
266
|
+
self.fit_dropped_features: set[str] = set()
|
267
267
|
self.fit_search_keys = search_keys
|
268
268
|
self.warning_counter = WarningCounter()
|
269
|
-
self.X:
|
270
|
-
self.y:
|
271
|
-
self.eval_set:
|
269
|
+
self.X: pd.DataFrame | None = None
|
270
|
+
self.y: pd.Series | None = None
|
271
|
+
self.eval_set: list[tuple] | None = None
|
272
272
|
self.autodetected_search_keys: dict[str, SearchKey] = {}
|
273
273
|
self.imbalanced = False
|
274
274
|
self.fit_select_features = True
|
@@ -288,17 +288,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
288
288
|
self.model_task_type = ModelTaskType.parse(model_task_type)
|
289
289
|
self.model_task_type = model_task_type
|
290
290
|
self.endpoint = endpoint
|
291
|
-
self._search_task:
|
291
|
+
self._search_task: SearchTask | None = None
|
292
292
|
self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
293
293
|
self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
294
294
|
self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
|
295
295
|
self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
296
296
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
297
|
-
self.metrics:
|
297
|
+
self.metrics: pd.DataFrame | None = None
|
298
298
|
self.feature_names_ = []
|
299
299
|
self.external_source_feature_names = []
|
300
300
|
self.feature_importances_ = []
|
301
|
-
self.psi_values:
|
301
|
+
self.psi_values: dict[str, float] | None = None
|
302
302
|
self.search_id = search_id
|
303
303
|
self.disable_force_downsampling = disable_force_downsampling
|
304
304
|
self.print_trace_id = print_trace_id
|
@@ -375,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
375
375
|
self.autofe_features_display_handle = None
|
376
376
|
self.report_button_handle = None
|
377
377
|
|
378
|
-
def _get_sample_config(self, sample_config:
|
378
|
+
def _get_sample_config(self, sample_config: SampleConfig | None = None):
|
379
379
|
sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
|
380
380
|
|
381
381
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
@@ -430,20 +430,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
430
430
|
|
431
431
|
def fit(
|
432
432
|
self,
|
433
|
-
X:
|
434
|
-
y:
|
435
|
-
eval_set:
|
433
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
434
|
+
y: pd.Series | np.ndarray | list,
|
435
|
+
eval_set: list[tuple] | tuple | None = None,
|
436
436
|
*args,
|
437
|
-
exclude_features_sources:
|
438
|
-
calculate_metrics:
|
439
|
-
estimator:
|
440
|
-
scoring:
|
441
|
-
remove_outliers_calc_metrics:
|
442
|
-
progress_callback:
|
443
|
-
search_id_callback:
|
437
|
+
exclude_features_sources: list[str] | None = None,
|
438
|
+
calculate_metrics: bool | None = None,
|
439
|
+
estimator: Any | None = None,
|
440
|
+
scoring: Callable | str | None = None,
|
441
|
+
remove_outliers_calc_metrics: bool | None = None,
|
442
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
443
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
444
444
|
select_features: bool = True,
|
445
|
-
auto_fe_parameters:
|
446
|
-
stability_threshold: float = 0.
|
445
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
446
|
+
stability_threshold: float = 0.2,
|
447
447
|
stability_agg_func: str = "max",
|
448
448
|
**kwargs,
|
449
449
|
):
|
@@ -479,7 +479,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
479
479
|
If True, return only selected features both from input and data sources.
|
480
480
|
Otherwise, return all features from input and only selected features from data sources.
|
481
481
|
|
482
|
-
stability_threshold: float, optional (default=0.
|
482
|
+
stability_threshold: float, optional (default=0.2)
|
483
483
|
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
484
484
|
then feature will be dropped.
|
485
485
|
|
@@ -579,27 +579,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
579
579
|
|
580
580
|
def fit_transform(
|
581
581
|
self,
|
582
|
-
X:
|
583
|
-
y:
|
584
|
-
eval_set:
|
582
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
583
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
584
|
+
eval_set: list[tuple] | tuple | None = None,
|
585
585
|
*args,
|
586
|
-
exclude_features_sources:
|
586
|
+
exclude_features_sources: list[str] | None | None = None,
|
587
587
|
keep_input: bool = True,
|
588
|
-
calculate_metrics:
|
589
|
-
scoring:
|
590
|
-
estimator:
|
591
|
-
remove_outliers_calc_metrics:
|
592
|
-
progress_callback:
|
588
|
+
calculate_metrics: bool | None = None,
|
589
|
+
scoring: Callable | str | None = None,
|
590
|
+
estimator: Any | None = None,
|
591
|
+
remove_outliers_calc_metrics: bool | None = None,
|
592
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
593
593
|
select_features: bool = True,
|
594
|
-
auto_fe_parameters:
|
595
|
-
stability_threshold: float = 0.
|
594
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
595
|
+
stability_threshold: float = 0.2,
|
596
596
|
stability_agg_func: str = "max",
|
597
597
|
**kwargs,
|
598
598
|
) -> pd.DataFrame:
|
599
599
|
"""Fit to data, then transform it.
|
600
600
|
|
601
601
|
Fits transformer to `X` and `y` and returns a transformed version of `X`.
|
602
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
603
602
|
|
604
603
|
Parameters
|
605
604
|
----------
|
@@ -613,7 +612,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
613
612
|
list of pairs (X, y) for validation.
|
614
613
|
|
615
614
|
keep_input: bool, optional (default=True)
|
616
|
-
|
615
|
+
keep_input: bool, optional (default=True)
|
616
|
+
If True, then all search keys, ID columns, selected client features and enriched columns will be returned.
|
617
|
+
If False, then only enriched columns are returned.
|
617
618
|
|
618
619
|
estimator: sklearn-compatible estimator, optional (default=None)
|
619
620
|
Custom estimator for metrics calculation.
|
@@ -629,7 +630,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
629
630
|
If True, return only selected features both from input and data sources.
|
630
631
|
Otherwise, return all features from input and only selected features from data sources.
|
631
632
|
|
632
|
-
stability_threshold: float, optional (default=0.
|
633
|
+
stability_threshold: float, optional (default=0.2)
|
633
634
|
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
634
635
|
then feature will be dropped.
|
635
636
|
|
@@ -747,20 +748,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
747
748
|
self,
|
748
749
|
X: pd.DataFrame,
|
749
750
|
*args,
|
750
|
-
y:
|
751
|
-
exclude_features_sources:
|
751
|
+
y: pd.Series | None = None,
|
752
|
+
exclude_features_sources: list[str] | None = None,
|
752
753
|
keep_input: bool = True,
|
753
|
-
trace_id:
|
754
|
+
trace_id: str | None = None,
|
754
755
|
metrics_calculation: bool = False,
|
755
756
|
silent_mode=False,
|
756
|
-
progress_bar:
|
757
|
-
progress_callback:
|
757
|
+
progress_bar: ProgressBar | None = None,
|
758
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
758
759
|
**kwargs,
|
759
760
|
) -> pd.DataFrame:
|
760
761
|
"""Transform `X`.
|
761
762
|
|
762
763
|
Returns a transformed version of `X`.
|
763
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
764
764
|
|
765
765
|
Parameters
|
766
766
|
----------
|
@@ -768,7 +768,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
768
768
|
Input samples.
|
769
769
|
|
770
770
|
keep_input: bool, optional (default=True)
|
771
|
-
|
771
|
+
keep_input: bool, optional (default=True)
|
772
|
+
If True, then all search keys, ID columns, selected client features, enriched columns and input columns
|
773
|
+
that were not present on fit will be returned.
|
774
|
+
If False, then only enriched columns are returned.
|
772
775
|
|
773
776
|
Returns
|
774
777
|
-------
|
@@ -809,6 +812,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
809
812
|
metrics_calculation=metrics_calculation,
|
810
813
|
silent_mode=silent_mode,
|
811
814
|
progress_bar=progress_bar,
|
815
|
+
keep_input=keep_input,
|
812
816
|
)
|
813
817
|
self.logger.info("Transform finished successfully")
|
814
818
|
search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
|
@@ -850,30 +854,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
850
854
|
raise e
|
851
855
|
finally:
|
852
856
|
self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
|
853
|
-
|
854
|
-
|
855
|
-
if keep_input:
|
856
|
-
return result
|
857
|
-
else:
|
858
|
-
return result.drop(columns=X.columns, errors="ignore")
|
857
|
+
|
858
|
+
return result
|
859
859
|
|
860
860
|
def calculate_metrics(
|
861
861
|
self,
|
862
|
-
X:
|
863
|
-
y:
|
864
|
-
eval_set:
|
862
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
863
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
864
|
+
eval_set: list[tuple] | tuple | None = None,
|
865
865
|
*args,
|
866
|
-
scoring:
|
867
|
-
cv:
|
866
|
+
scoring: Callable | str | None = None,
|
867
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
868
868
|
estimator=None,
|
869
|
-
exclude_features_sources:
|
870
|
-
remove_outliers_calc_metrics:
|
871
|
-
trace_id:
|
869
|
+
exclude_features_sources: list[str] | None = None,
|
870
|
+
remove_outliers_calc_metrics: bool | None = None,
|
871
|
+
trace_id: str | None = None,
|
872
872
|
internal_call: bool = False,
|
873
|
-
progress_bar:
|
874
|
-
progress_callback:
|
873
|
+
progress_bar: ProgressBar | None = None,
|
874
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
875
875
|
**kwargs,
|
876
|
-
) ->
|
876
|
+
) -> pd.DataFrame | None:
|
877
877
|
"""Calculate metrics
|
878
878
|
|
879
879
|
Parameters
|
@@ -1311,16 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
1311
1311
|
def _select_features_by_psi(
|
1312
1312
|
self,
|
1313
1313
|
trace_id: str,
|
1314
|
-
X:
|
1315
|
-
y:
|
1316
|
-
eval_set:
|
1314
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
1315
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
1316
|
+
eval_set: list[tuple] | tuple | None,
|
1317
1317
|
stability_threshold: float,
|
1318
1318
|
stability_agg_func: Callable,
|
1319
|
-
cv:
|
1319
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
1320
1320
|
estimator=None,
|
1321
|
-
exclude_features_sources:
|
1321
|
+
exclude_features_sources: list[str] | None = None,
|
1322
1322
|
progress_bar: bool = True,
|
1323
|
-
progress_callback:
|
1323
|
+
progress_callback: Callable | None = None,
|
1324
1324
|
):
|
1325
1325
|
search_keys = self.search_keys.copy()
|
1326
1326
|
validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
|
@@ -1469,7 +1469,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1469
1469
|
|
1470
1470
|
unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
|
1471
1471
|
if unstable_by_sparsity:
|
1472
|
-
self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
|
1472
|
+
self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
|
1473
1473
|
|
1474
1474
|
psi_values = calculate_features_psi(
|
1475
1475
|
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
|
@@ -1479,7 +1479,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1479
1479
|
|
1480
1480
|
unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
|
1481
1481
|
if unstable_by_value:
|
1482
|
-
self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
|
1482
|
+
self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
|
1483
1483
|
|
1484
1484
|
self.psi_values = {
|
1485
1485
|
feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
|
@@ -1557,12 +1557,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1557
1557
|
self.logger.warning(msg)
|
1558
1558
|
|
1559
1559
|
def _has_features_with_commercial_schema(
|
1560
|
-
self, commercial_schema: str, exclude_features_sources:
|
1560
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1561
1561
|
) -> bool:
|
1562
1562
|
return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
|
1563
1563
|
|
1564
1564
|
def _get_features_with_commercial_schema(
|
1565
|
-
self, commercial_schema: str, exclude_features_sources:
|
1565
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1566
1566
|
) -> list[str]:
|
1567
1567
|
if exclude_features_sources:
|
1568
1568
|
filtered_features_info = self._internal_features_info[
|
@@ -1577,14 +1577,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
1577
1577
|
].values
|
1578
1578
|
)
|
1579
1579
|
|
1580
|
-
def _has_paid_features(self, exclude_features_sources:
|
1580
|
+
def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
|
1581
1581
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
1582
1582
|
|
1583
1583
|
def _is_input_same_as_fit(
|
1584
1584
|
self,
|
1585
|
-
X:
|
1586
|
-
y:
|
1587
|
-
eval_set:
|
1585
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1586
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1587
|
+
eval_set: list[tuple] | None = None,
|
1588
1588
|
) -> tuple:
|
1589
1589
|
if X is None:
|
1590
1590
|
return True, self.X, self.y, self.eval_set
|
@@ -1615,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
1615
1615
|
def _get_cv_and_groups(
|
1616
1616
|
self,
|
1617
1617
|
X: pd.DataFrame,
|
1618
|
-
cv_override:
|
1618
|
+
cv_override: BaseCrossValidator | CVType | str | None,
|
1619
1619
|
search_keys: dict[str, SearchKey],
|
1620
|
-
) -> tuple[BaseCrossValidator,
|
1620
|
+
) -> tuple[BaseCrossValidator, np.ndarray] | None:
|
1621
1621
|
_cv = cv_override or self.cv
|
1622
1622
|
group_columns = sorted(self._get_group_columns(X, search_keys))
|
1623
1623
|
groups = None
|
@@ -1645,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1645
1645
|
return _cv, groups
|
1646
1646
|
|
1647
1647
|
def _get_and_validate_client_cat_features(
|
1648
|
-
self, estimator:
|
1649
|
-
) -> tuple[
|
1648
|
+
self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
|
1649
|
+
) -> tuple[list[str] | None, list[str]]:
|
1650
1650
|
cat_features = []
|
1651
1651
|
search_keys_for_metrics = []
|
1652
1652
|
if (
|
@@ -1678,16 +1678,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
1678
1678
|
def _get_cached_enriched_data(
|
1679
1679
|
self,
|
1680
1680
|
trace_id: str,
|
1681
|
-
X:
|
1682
|
-
y:
|
1683
|
-
eval_set:
|
1684
|
-
exclude_features_sources:
|
1685
|
-
remove_outliers_calc_metrics:
|
1686
|
-
cv_override:
|
1687
|
-
search_keys_for_metrics:
|
1688
|
-
progress_bar:
|
1689
|
-
progress_callback:
|
1690
|
-
client_cat_features:
|
1681
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1682
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1683
|
+
eval_set: list[tuple] | tuple | None = None,
|
1684
|
+
exclude_features_sources: list[str] | None = None,
|
1685
|
+
remove_outliers_calc_metrics: bool | None = None,
|
1686
|
+
cv_override: BaseCrossValidator | CVType | str | None = None,
|
1687
|
+
search_keys_for_metrics: list[str] | None = None,
|
1688
|
+
progress_bar: ProgressBar | None = None,
|
1689
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
1690
|
+
client_cat_features: list[str] | None = None,
|
1691
1691
|
is_for_metrics: bool = False,
|
1692
1692
|
):
|
1693
1693
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
@@ -1893,15 +1893,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
1893
1893
|
def _get_enriched_datasets(
|
1894
1894
|
self,
|
1895
1895
|
trace_id: str,
|
1896
|
-
validated_X:
|
1897
|
-
validated_y:
|
1898
|
-
eval_set:
|
1899
|
-
exclude_features_sources:
|
1896
|
+
validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
|
1897
|
+
validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
1898
|
+
eval_set: list[tuple] | None,
|
1899
|
+
exclude_features_sources: list[str] | None,
|
1900
1900
|
is_input_same_as_fit: bool,
|
1901
1901
|
is_demo_dataset: bool,
|
1902
|
-
remove_outliers_calc_metrics:
|
1903
|
-
progress_bar:
|
1904
|
-
progress_callback:
|
1902
|
+
remove_outliers_calc_metrics: bool | None,
|
1903
|
+
progress_bar: ProgressBar | None,
|
1904
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
1905
1905
|
is_for_metrics: bool = False,
|
1906
1906
|
) -> _EnrichedDataForMetrics:
|
1907
1907
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
@@ -1939,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1939
1939
|
)
|
1940
1940
|
|
1941
1941
|
def __get_sampled_cached_enriched(
|
1942
|
-
self, datasets_hash: str, exclude_features_sources:
|
1942
|
+
self, datasets_hash: str, exclude_features_sources: list[str] | None
|
1943
1943
|
) -> _EnrichedDataForMetrics:
|
1944
1944
|
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
|
1945
1945
|
self.__cached_sampled_datasets[datasets_hash]
|
@@ -1959,7 +1959,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1959
1959
|
)
|
1960
1960
|
|
1961
1961
|
def __get_enriched_as_input(
|
1962
|
-
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set:
|
1962
|
+
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
|
1963
1963
|
) -> _EnrichedDataForMetrics:
|
1964
1964
|
eval_set_sampled_dict = {}
|
1965
1965
|
|
@@ -2055,9 +2055,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
2055
2055
|
|
2056
2056
|
def __get_enriched_from_fit(
|
2057
2057
|
self,
|
2058
|
-
eval_set:
|
2058
|
+
eval_set: list[tuple] | None,
|
2059
2059
|
trace_id: str,
|
2060
|
-
remove_outliers_calc_metrics:
|
2060
|
+
remove_outliers_calc_metrics: bool | None,
|
2061
2061
|
) -> _EnrichedDataForMetrics:
|
2062
2062
|
eval_set_sampled_dict = {}
|
2063
2063
|
search_keys = self.fit_search_keys.copy()
|
@@ -2163,11 +2163,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2163
2163
|
self,
|
2164
2164
|
validated_X: pd.DataFrame,
|
2165
2165
|
validated_y: pd.Series,
|
2166
|
-
eval_set:
|
2167
|
-
exclude_features_sources:
|
2166
|
+
eval_set: list[tuple] | None,
|
2167
|
+
exclude_features_sources: list[str] | None,
|
2168
2168
|
trace_id: str,
|
2169
|
-
progress_bar:
|
2170
|
-
progress_callback:
|
2169
|
+
progress_bar: ProgressBar | None,
|
2170
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
2171
2171
|
is_for_metrics: bool = False,
|
2172
2172
|
) -> _EnrichedDataForMetrics:
|
2173
2173
|
has_eval_set = eval_set is not None
|
@@ -2178,7 +2178,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2178
2178
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
|
2179
2179
|
|
2180
2180
|
# Exclude OOT eval sets from transform because they are not used for metrics calculation
|
2181
|
-
if
|
2181
|
+
if is_for_metrics and EVAL_SET_INDEX in df.columns:
|
2182
2182
|
for eval_index in df[EVAL_SET_INDEX].unique():
|
2183
2183
|
if eval_index == 0:
|
2184
2184
|
continue
|
@@ -2231,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2231
2231
|
)
|
2232
2232
|
|
2233
2233
|
def __combine_train_and_eval_sets(
|
2234
|
-
self, X: pd.DataFrame, y:
|
2234
|
+
self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
|
2235
2235
|
) -> pd.DataFrame:
|
2236
2236
|
df = X.copy()
|
2237
2237
|
if y is not None:
|
@@ -2354,7 +2354,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2354
2354
|
generated_features=generated_features,
|
2355
2355
|
)
|
2356
2356
|
|
2357
|
-
def get_search_id(self) ->
|
2357
|
+
def get_search_id(self) -> str | None:
|
2358
2358
|
"""Returns search_id of the fitted enricher. Not available before a successful fit."""
|
2359
2359
|
return self._search_task.search_task_id if self._search_task else None
|
2360
2360
|
|
@@ -2367,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2367
2367
|
|
2368
2368
|
return self.features_info
|
2369
2369
|
|
2370
|
-
def get_progress(self, trace_id:
|
2370
|
+
def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
|
2371
2371
|
search_task = search_task or self._search_task
|
2372
2372
|
if search_task is not None:
|
2373
2373
|
trace_id = trace_id or uuid.uuid4()
|
@@ -2475,13 +2475,14 @@ if response.status_code == 200:
|
|
2475
2475
|
trace_id: str,
|
2476
2476
|
X: pd.DataFrame,
|
2477
2477
|
*,
|
2478
|
-
y:
|
2479
|
-
exclude_features_sources:
|
2478
|
+
y: pd.Series | None = None,
|
2479
|
+
exclude_features_sources: list[str] | None = None,
|
2480
2480
|
metrics_calculation: bool = False,
|
2481
2481
|
silent_mode: bool = False,
|
2482
|
-
progress_bar:
|
2483
|
-
progress_callback:
|
2482
|
+
progress_bar: ProgressBar | None = None,
|
2483
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2484
2484
|
add_fit_system_record_id: bool = False,
|
2485
|
+
keep_input: bool = True,
|
2485
2486
|
) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
|
2486
2487
|
if self._search_task is None:
|
2487
2488
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
@@ -2775,7 +2776,7 @@ if response.status_code == 200:
|
|
2775
2776
|
progress_bar.progress = progress.to_progress_bar()
|
2776
2777
|
if progress_callback is not None:
|
2777
2778
|
progress_callback(progress)
|
2778
|
-
prev_progress:
|
2779
|
+
prev_progress: SearchProgress | None = None
|
2779
2780
|
polling_period_seconds = 1
|
2780
2781
|
try:
|
2781
2782
|
while progress.stage != ProgressStage.DOWNLOADING.value:
|
@@ -2830,21 +2831,30 @@ if response.status_code == 200:
|
|
2830
2831
|
how="left",
|
2831
2832
|
)
|
2832
2833
|
|
2834
|
+
fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
|
2835
|
+
new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
|
2836
|
+
|
2833
2837
|
selected_generated_features = [
|
2834
2838
|
c for c in generated_features if not self.fit_select_features or c in self.feature_names_
|
2835
2839
|
]
|
2836
|
-
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2840
|
+
if keep_input is True:
|
2841
|
+
selected_input_columns = [
|
2842
|
+
c
|
2843
|
+
for c in validated_Xy.columns
|
2844
|
+
if not self.fit_select_features
|
2845
|
+
or c in self.feature_names_
|
2846
|
+
or c in new_columns_on_transform
|
2847
|
+
or c in self.search_keys
|
2848
|
+
or c in (self.id_columns or [])
|
2849
|
+
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
2850
|
+
]
|
2851
|
+
else:
|
2852
|
+
selected_input_columns = []
|
2853
|
+
|
2845
2854
|
selecting_columns = selected_input_columns + selected_generated_features
|
2846
2855
|
selecting_columns.extend(
|
2847
|
-
c for c in result.columns
|
2856
|
+
c for c in result.columns
|
2857
|
+
if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
|
2848
2858
|
)
|
2849
2859
|
if add_fit_system_record_id:
|
2850
2860
|
selecting_columns.append(SORT_ID)
|
@@ -2871,7 +2881,7 @@ if response.status_code == 200:
|
|
2871
2881
|
|
2872
2882
|
return result, columns_renaming, generated_features, search_keys
|
2873
2883
|
|
2874
|
-
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id:
|
2884
|
+
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
2875
2885
|
if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
|
2876
2886
|
if search_id:
|
2877
2887
|
self.logger.debug(f"search_id {search_id} provided without search_keys")
|
@@ -2944,22 +2954,22 @@ if response.status_code == 200:
|
|
2944
2954
|
def __inner_fit(
|
2945
2955
|
self,
|
2946
2956
|
trace_id: str,
|
2947
|
-
X:
|
2948
|
-
y:
|
2949
|
-
eval_set:
|
2950
|
-
progress_bar:
|
2957
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
2958
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
2959
|
+
eval_set: list[tuple] | None,
|
2960
|
+
progress_bar: ProgressBar | None,
|
2951
2961
|
start_time: int,
|
2952
2962
|
*,
|
2953
|
-
exclude_features_sources:
|
2954
|
-
calculate_metrics:
|
2955
|
-
scoring:
|
2956
|
-
estimator:
|
2963
|
+
exclude_features_sources: list[str] | None = None,
|
2964
|
+
calculate_metrics: bool | None,
|
2965
|
+
scoring: Callable | str | None,
|
2966
|
+
estimator: Any | None,
|
2957
2967
|
stability_threshold: float,
|
2958
2968
|
stability_agg_func: str,
|
2959
|
-
remove_outliers_calc_metrics:
|
2969
|
+
remove_outliers_calc_metrics: bool | None,
|
2960
2970
|
auto_fe_parameters: AutoFEParameters,
|
2961
|
-
progress_callback:
|
2962
|
-
search_id_callback:
|
2971
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2972
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
2963
2973
|
):
|
2964
2974
|
self._search_task = None
|
2965
2975
|
self.warning_counter.reset()
|
@@ -3378,16 +3388,6 @@ if response.status_code == 200:
|
|
3378
3388
|
|
3379
3389
|
self.__show_selected_features()
|
3380
3390
|
|
3381
|
-
autofe_description = self.get_autofe_features_description()
|
3382
|
-
if autofe_description is not None and len(autofe_description) > 0:
|
3383
|
-
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
3384
|
-
self.autofe_features_display_handle = display_html_dataframe(
|
3385
|
-
df=autofe_description,
|
3386
|
-
internal_df=autofe_description,
|
3387
|
-
header=self.bundle.get("autofe_descriptions_header"),
|
3388
|
-
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
3389
|
-
)
|
3390
|
-
|
3391
3391
|
if self._has_paid_features(exclude_features_sources):
|
3392
3392
|
if calculate_metrics is not None and calculate_metrics:
|
3393
3393
|
msg = self.bundle.get("metrics_with_paid_features")
|
@@ -3466,7 +3466,7 @@ if response.status_code == 200:
|
|
3466
3466
|
def __should_add_date_column(self):
|
3467
3467
|
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
3468
3468
|
|
3469
|
-
def __get_renamed_id_columns(self, renaming:
|
3469
|
+
def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
|
3470
3470
|
renaming = renaming or self.fit_columns_renaming
|
3471
3471
|
reverse_renaming = {v: k for k, v in renaming.items()}
|
3472
3472
|
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
@@ -3511,11 +3511,11 @@ if response.status_code == 200:
|
|
3511
3511
|
def _validate_train_eval(
|
3512
3512
|
self,
|
3513
3513
|
X: pd.DataFrame,
|
3514
|
-
y:
|
3515
|
-
eval_set:
|
3514
|
+
y: pd.Series | None = None,
|
3515
|
+
eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
|
3516
3516
|
is_transform: bool = False,
|
3517
3517
|
silent: bool = False,
|
3518
|
-
) -> tuple[pd.DataFrame, pd.Series,
|
3518
|
+
) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
|
3519
3519
|
validated_X = self._validate_X(X, is_transform)
|
3520
3520
|
validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
|
3521
3521
|
validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
|
@@ -3594,7 +3594,7 @@ if response.status_code == 200:
|
|
3594
3594
|
|
3595
3595
|
return validated_X
|
3596
3596
|
|
3597
|
-
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) ->
|
3597
|
+
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
|
3598
3598
|
if y is None and not enforce_y:
|
3599
3599
|
return None
|
3600
3600
|
if (
|
@@ -3644,7 +3644,7 @@ if response.status_code == 200:
|
|
3644
3644
|
return validated_y
|
3645
3645
|
|
3646
3646
|
def _validate_eval_set(
|
3647
|
-
self, X: pd.DataFrame, eval_set:
|
3647
|
+
self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
|
3648
3648
|
):
|
3649
3649
|
if eval_set is None:
|
3650
3650
|
return None
|
@@ -3756,7 +3756,7 @@ if response.status_code == 200:
|
|
3756
3756
|
|
3757
3757
|
return validated_eval_X, validated_eval_y
|
3758
3758
|
|
3759
|
-
def _validate_baseline_score(self, X: pd.DataFrame, eval_set:
|
3759
|
+
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
|
3760
3760
|
if self.baseline_score_column is not None:
|
3761
3761
|
if self.baseline_score_column not in X.columns:
|
3762
3762
|
raise ValidationError(
|
@@ -3783,7 +3783,7 @@ if response.status_code == 200:
|
|
3783
3783
|
|
3784
3784
|
@staticmethod
|
3785
3785
|
def _sort_by_system_record_id(
|
3786
|
-
X: pd.DataFrame, y: pd.Series, cv:
|
3786
|
+
X: pd.DataFrame, y: pd.Series, cv: CVType | None
|
3787
3787
|
) -> tuple[pd.DataFrame, pd.Series]:
|
3788
3788
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3789
3789
|
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
@@ -3801,7 +3801,7 @@ if response.status_code == 200:
|
|
3801
3801
|
# Deprecated
|
3802
3802
|
@staticmethod
|
3803
3803
|
def _sort_by_keys(
|
3804
|
-
X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv:
|
3804
|
+
X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
|
3805
3805
|
) -> tuple[pd.DataFrame, pd.Series]:
|
3806
3806
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3807
3807
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
@@ -3841,14 +3841,14 @@ if response.status_code == 200:
|
|
3841
3841
|
def __log_debug_information(
|
3842
3842
|
self,
|
3843
3843
|
X: pd.DataFrame,
|
3844
|
-
y:
|
3845
|
-
eval_set:
|
3846
|
-
exclude_features_sources:
|
3847
|
-
calculate_metrics:
|
3848
|
-
cv:
|
3849
|
-
scoring:
|
3850
|
-
estimator:
|
3851
|
-
remove_outliers_calc_metrics:
|
3844
|
+
y: pd.Series | np.ndarray | list | None = None,
|
3845
|
+
eval_set: list[tuple] | None = None,
|
3846
|
+
exclude_features_sources: list[str] | None = None,
|
3847
|
+
calculate_metrics: bool | None = None,
|
3848
|
+
cv: Any | None = None,
|
3849
|
+
scoring: Any | None = None,
|
3850
|
+
estimator: Any | None = None,
|
3851
|
+
remove_outliers_calc_metrics: bool | None = None,
|
3852
3852
|
):
|
3853
3853
|
try:
|
3854
3854
|
resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
|
@@ -3973,7 +3973,7 @@ if response.status_code == 200:
|
|
3973
3973
|
]
|
3974
3974
|
|
3975
3975
|
@staticmethod
|
3976
|
-
def _get_email_column(search_keys: dict[str, SearchKey]) ->
|
3976
|
+
def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3977
3977
|
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
3978
3978
|
if len(cols) > 1:
|
3979
3979
|
raise Exception("More than one email column found after unnest")
|
@@ -3981,7 +3981,7 @@ if response.status_code == 200:
|
|
3981
3981
|
return cols[0]
|
3982
3982
|
|
3983
3983
|
@staticmethod
|
3984
|
-
def _get_hem_column(search_keys: dict[str, SearchKey]) ->
|
3984
|
+
def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3985
3985
|
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
3986
3986
|
if len(cols) > 1:
|
3987
3987
|
raise Exception("More than one hem column found after unnest")
|
@@ -3989,7 +3989,7 @@ if response.status_code == 200:
|
|
3989
3989
|
return cols[0]
|
3990
3990
|
|
3991
3991
|
@staticmethod
|
3992
|
-
def _get_ip_column(search_keys: dict[str, SearchKey]) ->
|
3992
|
+
def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3993
3993
|
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
3994
3994
|
if len(cols) > 1:
|
3995
3995
|
raise Exception("More than one ip column found after unnest")
|
@@ -3997,25 +3997,25 @@ if response.status_code == 200:
|
|
3997
3997
|
return cols[0]
|
3998
3998
|
|
3999
3999
|
@staticmethod
|
4000
|
-
def _get_phone_column(search_keys: dict[str, SearchKey]) ->
|
4000
|
+
def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4001
4001
|
for col, t in search_keys.items():
|
4002
4002
|
if t == SearchKey.PHONE:
|
4003
4003
|
return col
|
4004
4004
|
|
4005
4005
|
@staticmethod
|
4006
|
-
def _get_country_column(search_keys: dict[str, SearchKey]) ->
|
4006
|
+
def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4007
4007
|
for col, t in search_keys.items():
|
4008
4008
|
if t == SearchKey.COUNTRY:
|
4009
4009
|
return col
|
4010
4010
|
|
4011
4011
|
@staticmethod
|
4012
|
-
def _get_postal_column(search_keys: dict[str, SearchKey]) ->
|
4012
|
+
def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4013
4013
|
for col, t in search_keys.items():
|
4014
4014
|
if t == SearchKey.POSTAL_CODE:
|
4015
4015
|
return col
|
4016
4016
|
|
4017
4017
|
@staticmethod
|
4018
|
-
def _get_date_column(search_keys: dict[str, SearchKey]) ->
|
4018
|
+
def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4019
4019
|
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
4020
4020
|
|
4021
4021
|
def _explode_multiple_search_keys(
|
@@ -4062,10 +4062,10 @@ if response.status_code == 200:
|
|
4062
4062
|
id_name: str,
|
4063
4063
|
target_name: str,
|
4064
4064
|
columns_renaming: dict[str, str],
|
4065
|
-
id_columns:
|
4066
|
-
cv:
|
4065
|
+
id_columns: list[str] | None,
|
4066
|
+
cv: CVType | None,
|
4067
4067
|
model_task_type: ModelTaskType,
|
4068
|
-
logger:
|
4068
|
+
logger: logging.Logger | None = None,
|
4069
4069
|
bundle: ResourceBundle = bundle,
|
4070
4070
|
) -> pd.DataFrame:
|
4071
4071
|
original_index_name = df.index.name
|
@@ -4201,7 +4201,7 @@ if response.status_code == 200:
|
|
4201
4201
|
def __enrich(
|
4202
4202
|
self,
|
4203
4203
|
input_df: pd.DataFrame,
|
4204
|
-
result_features:
|
4204
|
+
result_features: pd.DataFrame | None,
|
4205
4205
|
how: str = "inner",
|
4206
4206
|
drop_system_record_id=True,
|
4207
4207
|
) -> pd.DataFrame:
|
@@ -4320,7 +4320,7 @@ if response.status_code == 200:
|
|
4320
4320
|
self,
|
4321
4321
|
trace_id: str,
|
4322
4322
|
clients_features_df: pd.DataFrame,
|
4323
|
-
updated_shaps:
|
4323
|
+
updated_shaps: dict[str, float] | None = None,
|
4324
4324
|
update_selected_features: bool = True,
|
4325
4325
|
silent=False,
|
4326
4326
|
):
|
@@ -4659,12 +4659,12 @@ if response.status_code == 200:
|
|
4659
4659
|
|
4660
4660
|
def __show_metrics(
|
4661
4661
|
self,
|
4662
|
-
scoring:
|
4663
|
-
estimator:
|
4664
|
-
remove_outliers_calc_metrics:
|
4662
|
+
scoring: Callable | str | None,
|
4663
|
+
estimator: Any | None,
|
4664
|
+
remove_outliers_calc_metrics: bool | None,
|
4665
4665
|
trace_id: str,
|
4666
|
-
progress_bar:
|
4667
|
-
progress_callback:
|
4666
|
+
progress_bar: ProgressBar | None = None,
|
4667
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
4668
4668
|
):
|
4669
4669
|
self.metrics = self.calculate_metrics(
|
4670
4670
|
scoring=scoring,
|
@@ -4698,13 +4698,23 @@ if response.status_code == 200:
|
|
4698
4698
|
self.bundle.get("relevant_data_sources_header"),
|
4699
4699
|
display_id=f"data_sources_{uuid.uuid4()}",
|
4700
4700
|
)
|
4701
|
+
|
4702
|
+
autofe_description = self.get_autofe_features_description()
|
4703
|
+
if autofe_description is not None and len(autofe_description) > 0:
|
4704
|
+
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
4705
|
+
self.autofe_features_display_handle = display_html_dataframe(
|
4706
|
+
df=autofe_description,
|
4707
|
+
internal_df=autofe_description,
|
4708
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
4709
|
+
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
4710
|
+
)
|
4701
4711
|
else:
|
4702
4712
|
msg = self.bundle.get("features_info_zero_important_features")
|
4703
4713
|
self.__log_warning(msg, show_support_link=True)
|
4704
4714
|
except (ImportError, NameError):
|
4705
4715
|
print(self._internal_features_info)
|
4706
4716
|
|
4707
|
-
def __show_report_button(self, display_id:
|
4717
|
+
def __show_report_button(self, display_id: str | None = None, display_handle=None):
|
4708
4718
|
try:
|
4709
4719
|
return prepare_and_show_report(
|
4710
4720
|
relevant_features_df=self._features_info_without_links,
|
@@ -4844,7 +4854,7 @@ if response.status_code == 200:
|
|
4844
4854
|
except Exception:
|
4845
4855
|
self.logger.exception("Failed to dump python libs")
|
4846
4856
|
|
4847
|
-
def __display_support_link(self, link_text:
|
4857
|
+
def __display_support_link(self, link_text: str | None = None):
|
4848
4858
|
support_link = self.bundle.get("support_link")
|
4849
4859
|
link_text = link_text or self.bundle.get("support_text")
|
4850
4860
|
try:
|
@@ -4871,9 +4881,9 @@ if response.status_code == 200:
|
|
4871
4881
|
def dump_input(
|
4872
4882
|
self,
|
4873
4883
|
trace_id: str,
|
4874
|
-
X:
|
4875
|
-
y:
|
4876
|
-
eval_set:
|
4884
|
+
X: pd.DataFrame | pd.Series,
|
4885
|
+
y: pd.DataFrame | pd.Series | None = None,
|
4886
|
+
eval_set: tuple | None = None,
|
4877
4887
|
):
|
4878
4888
|
def dump_task(X_, y_, eval_set_):
|
4879
4889
|
with MDC(trace_id=trace_id):
|
@@ -4964,7 +4974,7 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
|
|
4964
4974
|
raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
|
4965
4975
|
|
4966
4976
|
|
4967
|
-
def drop_duplicates(df:
|
4977
|
+
def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
|
4968
4978
|
if isinstance(df, pd.DataFrame):
|
4969
4979
|
return df.drop_duplicates()
|
4970
4980
|
elif isinstance(df, np.ndarray):
|
upgini/http.py
CHANGED
@@ -413,43 +413,9 @@ class _RestClient:
|
|
413
413
|
with open(path, "rb") as file:
|
414
414
|
files = {"file": (file_name, file, "application/octet-stream")}
|
415
415
|
self._with_unauth_retry(
|
416
|
-
lambda: self._send_post_file_req_v2(
|
417
|
-
api_path, files, trace_id=trace_id, need_json_response=False
|
418
|
-
)
|
416
|
+
lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
|
419
417
|
)
|
420
418
|
|
421
|
-
def dump_input_files(
|
422
|
-
self,
|
423
|
-
trace_id: str,
|
424
|
-
x_path: str,
|
425
|
-
y_path: Optional[str] = None,
|
426
|
-
eval_x_path: Optional[str] = None,
|
427
|
-
eval_y_path: Optional[str] = None,
|
428
|
-
):
|
429
|
-
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
430
|
-
|
431
|
-
def upload_with_check(path: str, file_name: str):
|
432
|
-
digest_sha256 = file_hash(path)
|
433
|
-
if self.is_file_uploaded(trace_id, digest_sha256):
|
434
|
-
# print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
|
435
|
-
return
|
436
|
-
else:
|
437
|
-
with open(path, "rb") as file:
|
438
|
-
files = {"file": (file_name, file, "application/octet-stream")}
|
439
|
-
self._with_unauth_retry(
|
440
|
-
lambda: self._send_post_file_req_v2(
|
441
|
-
api_path, files, trace_id=trace_id, need_json_response=False
|
442
|
-
)
|
443
|
-
)
|
444
|
-
|
445
|
-
upload_with_check(x_path, "x.parquet")
|
446
|
-
if y_path:
|
447
|
-
upload_with_check(y_path, "y.parquet")
|
448
|
-
if eval_x_path:
|
449
|
-
upload_with_check(eval_x_path, "eval_x.parquet")
|
450
|
-
if eval_y_path:
|
451
|
-
upload_with_check(eval_y_path, "eval_y.parquet")
|
452
|
-
|
453
419
|
def initial_search_v2(
|
454
420
|
self,
|
455
421
|
trace_id: str,
|
@@ -1080,6 +1046,7 @@ class LoggerFactory:
|
|
1080
1046
|
|
1081
1047
|
upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
|
1082
1048
|
upgini_logger.handlers.clear()
|
1049
|
+
upgini_logger.propagate = False # Prevent duplicate logging in Jupyter notebooks
|
1083
1050
|
rest_client = get_rest_client(backend_url, api_token, client_ip, client_visitorid)
|
1084
1051
|
datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
|
1085
1052
|
json_formatter = jsonlogger.JsonFormatter(
|
upgini/metrics.py
CHANGED
@@ -815,9 +815,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
815
815
|
encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
|
816
816
|
else:
|
817
817
|
encoded = cat_encoder.transform(x[self.cat_features])
|
818
|
-
cat_features =
|
819
|
-
x.drop(columns=
|
820
|
-
x[
|
818
|
+
cat_features = self.cat_features
|
819
|
+
x = x.drop(columns=self.cat_features, errors="ignore")
|
820
|
+
x[self.cat_features] = encoded
|
821
821
|
else:
|
822
822
|
cat_features = self.cat_features
|
823
823
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.
|
3
|
+
Version: 1.2.116
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -749,6 +749,36 @@ enricher.fit(
|
|
749
749
|
)
|
750
750
|
```
|
751
751
|
|
752
|
+
### Control feature stability with PSI parameters
|
753
|
+
|
754
|
+
`FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
|
755
|
+
|
756
|
+
```python
|
757
|
+
enricher = FeaturesEnricher(
|
758
|
+
search_keys={"registration_date": SearchKey.DATE}
|
759
|
+
)
|
760
|
+
|
761
|
+
# Control feature stability during fit
|
762
|
+
enricher.fit(
|
763
|
+
X, y,
|
764
|
+
stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
|
765
|
+
stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
|
766
|
+
)
|
767
|
+
|
768
|
+
# Same parameters work for fit_transform
|
769
|
+
enriched_df = enricher.fit_transform(
|
770
|
+
X, y,
|
771
|
+
stability_threshold=0.1, # Stricter threshold for more stable features
|
772
|
+
stability_agg_func="mean" # Use mean aggregation instead of max
|
773
|
+
)
|
774
|
+
```
|
775
|
+
|
776
|
+
**Stability parameters:**
|
777
|
+
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
|
778
|
+
- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
|
779
|
+
|
780
|
+
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
|
781
|
+
|
752
782
|
### Use custom loss function in feature selection & metrics calculation
|
753
783
|
|
754
784
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -874,7 +904,7 @@ Some convenient ways to start contributing are:
|
|
874
904
|
⚙️ **Gitpod** [Open in Gitpod](https://gitpod.io/#https://github.com/upgini/upgini) You can use Gitpod to launch a fully functional development environment right in your browser.
|
875
905
|
|
876
906
|
## 🔗 Useful links
|
877
|
-
- [Simple sales predictions as a template notebook](#-simple-sales-
|
907
|
+
- [Simple sales predictions as a template notebook](#-simple-sales-prediction-for-retail-stores)
|
878
908
|
- [Full list of Kaggle Guides & Examples](https://www.kaggle.com/romaupgini/code)
|
879
909
|
- [Project on PyPI](https://pypi.org/project/upgini)
|
880
910
|
- [More perks for registered users](https://profile.upgini.com)
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=uljlkRI1AQACoivbMb5ybeY28dehm84UBPMxgB7YFtM,24
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256
|
6
|
+
upgini/features_enricher.py,sha256=iYay-Ye5WGntieg3X7uyg9W3x_1FUELrmhJnJIvQMeI,228897
|
7
|
+
upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=_kBg6gSXx82LRlRadg-Qggm-GtcPBLbtK3nGCKQjELo,45925
|
10
10
|
upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
79
|
-
upgini-1.2.
|
80
|
-
upgini-1.2.
|
77
|
+
upgini-1.2.116.dist-info/METADATA,sha256=w7UkEj5YDpGksjUI1ii61tcOjPenO4XNEDrvPnaJVj4,50692
|
78
|
+
upgini-1.2.116.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.116.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.116.dist-info/RECORD,,
|
File without changes
|
File without changes
|