upgini 1.2.114a5__py3-none-any.whl → 1.2.115a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +215 -207
- upgini/http.py +2 -35
- {upgini-1.2.114a5.dist-info → upgini-1.2.115a1.dist-info}/METADATA +31 -1
- {upgini-1.2.114a5.dist-info → upgini-1.2.115a1.dist-info}/RECORD +7 -7
- {upgini-1.2.114a5.dist-info → upgini-1.2.115a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.114a5.dist-info → upgini-1.2.115a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.114a5"
|
1
|
+
__version__ = "1.2.115a1"
|
upgini/features_enricher.py
CHANGED
@@ -12,7 +12,7 @@ from collections import Counter
|
|
12
12
|
from copy import deepcopy
|
13
13
|
from dataclasses import dataclass
|
14
14
|
from threading import Thread
|
15
|
-
from typing import Any, Callable
|
15
|
+
from typing import Any, Callable
|
16
16
|
|
17
17
|
import numpy as np
|
18
18
|
import pandas as pd
|
@@ -207,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
207
207
|
|
208
208
|
def __init__(
|
209
209
|
self,
|
210
|
-
search_keys:
|
211
|
-
country_code:
|
212
|
-
model_task_type:
|
213
|
-
api_key:
|
214
|
-
endpoint:
|
215
|
-
search_id:
|
216
|
-
shared_datasets:
|
217
|
-
runtime_parameters:
|
218
|
-
date_format:
|
210
|
+
search_keys: dict[str, SearchKey] | None = None,
|
211
|
+
country_code: str | None = None,
|
212
|
+
model_task_type: ModelTaskType | str | None = None,
|
213
|
+
api_key: str | None = None,
|
214
|
+
endpoint: str | None = None,
|
215
|
+
search_id: str | None = None,
|
216
|
+
shared_datasets: list[str] | None = None,
|
217
|
+
runtime_parameters: RuntimeParameters | None = None,
|
218
|
+
date_format: str | None = None,
|
219
219
|
random_state: int = 42,
|
220
|
-
cv:
|
221
|
-
loss:
|
220
|
+
cv: CVType | None = None,
|
221
|
+
loss: str | None = None,
|
222
222
|
autodetect_search_keys: bool = True,
|
223
|
-
generate_features:
|
224
|
-
columns_for_online_api:
|
225
|
-
round_embeddings:
|
223
|
+
generate_features: list[str] | None = None,
|
224
|
+
columns_for_online_api: list[str] | None = None,
|
225
|
+
round_embeddings: int | None = None,
|
226
226
|
logs_enabled: bool = True,
|
227
227
|
raise_validation_error: bool = True,
|
228
|
-
exclude_columns:
|
229
|
-
baseline_score_column:
|
230
|
-
client_ip:
|
231
|
-
client_visitorid:
|
232
|
-
custom_bundle_config:
|
228
|
+
exclude_columns: list[str] | None = None,
|
229
|
+
baseline_score_column: Any | None = None,
|
230
|
+
client_ip: str | None = None,
|
231
|
+
client_visitorid: str | None = None,
|
232
|
+
custom_bundle_config: str | None = None,
|
233
233
|
add_date_if_missing: bool = True,
|
234
234
|
disable_force_downsampling: bool = False,
|
235
|
-
id_columns:
|
235
|
+
id_columns: list[str] | None = None,
|
236
236
|
generate_search_key_features: bool = True,
|
237
|
-
sample_config:
|
237
|
+
sample_config: SampleConfig | None = None,
|
238
238
|
print_trace_id: bool = False,
|
239
239
|
**kwargs,
|
240
240
|
):
|
@@ -259,16 +259,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
259
259
|
print(msg)
|
260
260
|
|
261
261
|
self.passed_features: list[str] = []
|
262
|
-
self.df_with_original_index:
|
263
|
-
self.fit_columns_renaming:
|
262
|
+
self.df_with_original_index: pd.DataFrame | None = None
|
263
|
+
self.fit_columns_renaming: dict[str, str] | None = None
|
264
264
|
self.country_added = False
|
265
265
|
self.fit_generated_features: list[str] = []
|
266
|
-
self.fit_dropped_features:
|
266
|
+
self.fit_dropped_features: set[str] = set()
|
267
267
|
self.fit_search_keys = search_keys
|
268
268
|
self.warning_counter = WarningCounter()
|
269
|
-
self.X:
|
270
|
-
self.y:
|
271
|
-
self.eval_set:
|
269
|
+
self.X: pd.DataFrame | None = None
|
270
|
+
self.y: pd.Series | None = None
|
271
|
+
self.eval_set: list[tuple] | None = None
|
272
272
|
self.autodetected_search_keys: dict[str, SearchKey] = {}
|
273
273
|
self.imbalanced = False
|
274
274
|
self.fit_select_features = True
|
@@ -288,17 +288,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
288
288
|
self.model_task_type = ModelTaskType.parse(model_task_type)
|
289
289
|
self.model_task_type = model_task_type
|
290
290
|
self.endpoint = endpoint
|
291
|
-
self._search_task:
|
291
|
+
self._search_task: SearchTask | None = None
|
292
292
|
self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
293
293
|
self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
294
294
|
self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
|
295
295
|
self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
296
296
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
297
|
-
self.metrics:
|
297
|
+
self.metrics: pd.DataFrame | None = None
|
298
298
|
self.feature_names_ = []
|
299
299
|
self.external_source_feature_names = []
|
300
300
|
self.feature_importances_ = []
|
301
|
-
self.psi_values:
|
301
|
+
self.psi_values: dict[str, float] | None = None
|
302
302
|
self.search_id = search_id
|
303
303
|
self.disable_force_downsampling = disable_force_downsampling
|
304
304
|
self.print_trace_id = print_trace_id
|
@@ -375,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
375
375
|
self.autofe_features_display_handle = None
|
376
376
|
self.report_button_handle = None
|
377
377
|
|
378
|
-
def _get_sample_config(self, sample_config:
|
378
|
+
def _get_sample_config(self, sample_config: SampleConfig | None = None):
|
379
379
|
sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
|
380
380
|
|
381
381
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
@@ -430,20 +430,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
430
430
|
|
431
431
|
def fit(
|
432
432
|
self,
|
433
|
-
X:
|
434
|
-
y:
|
435
|
-
eval_set:
|
433
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
434
|
+
y: pd.Series | np.ndarray | list,
|
435
|
+
eval_set: list[tuple] | tuple | None = None,
|
436
436
|
*args,
|
437
|
-
exclude_features_sources:
|
438
|
-
calculate_metrics:
|
439
|
-
estimator:
|
440
|
-
scoring:
|
441
|
-
remove_outliers_calc_metrics:
|
442
|
-
progress_callback:
|
443
|
-
search_id_callback:
|
437
|
+
exclude_features_sources: list[str] | None = None,
|
438
|
+
calculate_metrics: bool | None = None,
|
439
|
+
estimator: Any | None = None,
|
440
|
+
scoring: Callable | str | None = None,
|
441
|
+
remove_outliers_calc_metrics: bool | None = None,
|
442
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
443
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
444
444
|
select_features: bool = True,
|
445
|
-
auto_fe_parameters:
|
446
|
-
stability_threshold: float = 0.
|
445
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
446
|
+
stability_threshold: float = 0.2,
|
447
447
|
stability_agg_func: str = "max",
|
448
448
|
**kwargs,
|
449
449
|
):
|
@@ -479,7 +479,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
479
479
|
If True, return only selected features both from input and data sources.
|
480
480
|
Otherwise, return all features from input and only selected features from data sources.
|
481
481
|
|
482
|
-
stability_threshold: float, optional (default=0.
|
482
|
+
stability_threshold: float, optional (default=0.2)
|
483
483
|
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
484
484
|
then feature will be dropped.
|
485
485
|
|
@@ -579,27 +579,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
579
579
|
|
580
580
|
def fit_transform(
|
581
581
|
self,
|
582
|
-
X:
|
583
|
-
y:
|
584
|
-
eval_set:
|
582
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
583
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
584
|
+
eval_set: list[tuple] | tuple | None = None,
|
585
585
|
*args,
|
586
|
-
exclude_features_sources:
|
587
|
-
keep_input: bool = True,
|
588
|
-
calculate_metrics:
|
589
|
-
scoring:
|
590
|
-
estimator:
|
591
|
-
remove_outliers_calc_metrics:
|
592
|
-
progress_callback:
|
586
|
+
exclude_features_sources: list[str] | None | None = None,
|
587
|
+
keep_input: bool | None = None,
|
588
|
+
calculate_metrics: bool | None = None,
|
589
|
+
scoring: Callable | str | None = None,
|
590
|
+
estimator: Any | None = None,
|
591
|
+
remove_outliers_calc_metrics: bool | None = None,
|
592
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
593
593
|
select_features: bool = True,
|
594
|
-
auto_fe_parameters:
|
595
|
-
stability_threshold: float = 0.
|
594
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
595
|
+
stability_threshold: float = 0.2,
|
596
596
|
stability_agg_func: str = "max",
|
597
597
|
**kwargs,
|
598
598
|
) -> pd.DataFrame:
|
599
599
|
"""Fit to data, then transform it.
|
600
600
|
|
601
601
|
Fits transformer to `X` and `y` and returns a transformed version of `X`.
|
602
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
603
602
|
|
604
603
|
Parameters
|
605
604
|
----------
|
@@ -613,7 +612,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
613
612
|
list of pairs (X, y) for validation.
|
614
613
|
|
615
614
|
keep_input: bool, optional (default=True)
|
615
|
+
keep_input: bool, optional (default=None)
|
616
616
|
If True, copy original input columns to the output dataframe.
|
617
|
+
If False, then only enriched columns are returned.
|
618
|
+
If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
|
617
619
|
|
618
620
|
estimator: sklearn-compatible estimator, optional (default=None)
|
619
621
|
Custom estimator for metrics calculation.
|
@@ -629,7 +631,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
629
631
|
If True, return only selected features both from input and data sources.
|
630
632
|
Otherwise, return all features from input and only selected features from data sources.
|
631
633
|
|
632
|
-
stability_threshold: float, optional (default=0.
|
634
|
+
stability_threshold: float, optional (default=0.2)
|
633
635
|
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
634
636
|
then feature will be dropped.
|
635
637
|
|
@@ -747,28 +749,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
747
749
|
self,
|
748
750
|
X: pd.DataFrame,
|
749
751
|
*args,
|
750
|
-
y:
|
751
|
-
exclude_features_sources:
|
752
|
-
keep_input: bool =
|
753
|
-
trace_id:
|
752
|
+
y: pd.Series | None = None,
|
753
|
+
exclude_features_sources: list[str] | None = None,
|
754
|
+
keep_input: bool | None = None,
|
755
|
+
trace_id: str | None = None,
|
754
756
|
metrics_calculation: bool = False,
|
755
757
|
silent_mode=False,
|
756
|
-
progress_bar:
|
757
|
-
progress_callback:
|
758
|
+
progress_bar: ProgressBar | None = None,
|
759
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
758
760
|
**kwargs,
|
759
761
|
) -> pd.DataFrame:
|
760
762
|
"""Transform `X`.
|
761
763
|
|
762
764
|
Returns a transformed version of `X`.
|
763
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
764
765
|
|
765
766
|
Parameters
|
766
767
|
----------
|
767
768
|
X: pandas.DataFrame of shape (n_samples, n_features)
|
768
769
|
Input samples.
|
769
770
|
|
770
|
-
keep_input: bool, optional (default=
|
771
|
+
keep_input: bool, optional (default=None)
|
771
772
|
If True, copy original input columns to the output dataframe.
|
773
|
+
If False, then only enriched columns are returned.
|
774
|
+
If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
|
772
775
|
|
773
776
|
Returns
|
774
777
|
-------
|
@@ -809,6 +812,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
809
812
|
metrics_calculation=metrics_calculation,
|
810
813
|
silent_mode=silent_mode,
|
811
814
|
progress_bar=progress_bar,
|
815
|
+
keep_input=keep_input,
|
812
816
|
)
|
813
817
|
self.logger.info("Transform finished successfully")
|
814
818
|
search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
|
@@ -850,30 +854,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
850
854
|
raise e
|
851
855
|
finally:
|
852
856
|
self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
|
853
|
-
|
854
|
-
|
855
|
-
if keep_input:
|
856
|
-
return result
|
857
|
-
else:
|
858
|
-
return result.drop(columns=X.columns, errors="ignore")
|
857
|
+
|
858
|
+
return result
|
859
859
|
|
860
860
|
def calculate_metrics(
|
861
861
|
self,
|
862
|
-
X:
|
863
|
-
y:
|
864
|
-
eval_set:
|
862
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
863
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
864
|
+
eval_set: list[tuple] | tuple | None = None,
|
865
865
|
*args,
|
866
|
-
scoring:
|
867
|
-
cv:
|
866
|
+
scoring: Callable | str | None = None,
|
867
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
868
868
|
estimator=None,
|
869
|
-
exclude_features_sources:
|
870
|
-
remove_outliers_calc_metrics:
|
871
|
-
trace_id:
|
869
|
+
exclude_features_sources: list[str] | None = None,
|
870
|
+
remove_outliers_calc_metrics: bool | None = None,
|
871
|
+
trace_id: str | None = None,
|
872
872
|
internal_call: bool = False,
|
873
|
-
progress_bar:
|
874
|
-
progress_callback:
|
873
|
+
progress_bar: ProgressBar | None = None,
|
874
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
875
875
|
**kwargs,
|
876
|
-
) ->
|
876
|
+
) -> pd.DataFrame | None:
|
877
877
|
"""Calculate metrics
|
878
878
|
|
879
879
|
Parameters
|
@@ -1311,16 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
1311
1311
|
def _select_features_by_psi(
|
1312
1312
|
self,
|
1313
1313
|
trace_id: str,
|
1314
|
-
X:
|
1315
|
-
y:
|
1316
|
-
eval_set:
|
1314
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
1315
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
1316
|
+
eval_set: list[tuple] | tuple | None,
|
1317
1317
|
stability_threshold: float,
|
1318
1318
|
stability_agg_func: Callable,
|
1319
|
-
cv:
|
1319
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
1320
1320
|
estimator=None,
|
1321
|
-
exclude_features_sources:
|
1321
|
+
exclude_features_sources: list[str] | None = None,
|
1322
1322
|
progress_bar: bool = True,
|
1323
|
-
progress_callback:
|
1323
|
+
progress_callback: Callable | None = None,
|
1324
1324
|
):
|
1325
1325
|
search_keys = self.search_keys.copy()
|
1326
1326
|
validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
|
@@ -1469,7 +1469,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1469
1469
|
|
1470
1470
|
unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
|
1471
1471
|
if unstable_by_sparsity:
|
1472
|
-
self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
|
1472
|
+
self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
|
1473
1473
|
|
1474
1474
|
psi_values = calculate_features_psi(
|
1475
1475
|
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
|
@@ -1479,7 +1479,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1479
1479
|
|
1480
1480
|
unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
|
1481
1481
|
if unstable_by_value:
|
1482
|
-
self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
|
1482
|
+
self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
|
1483
1483
|
|
1484
1484
|
self.psi_values = {
|
1485
1485
|
feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
|
@@ -1557,12 +1557,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1557
1557
|
self.logger.warning(msg)
|
1558
1558
|
|
1559
1559
|
def _has_features_with_commercial_schema(
|
1560
|
-
self, commercial_schema: str, exclude_features_sources:
|
1560
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1561
1561
|
) -> bool:
|
1562
1562
|
return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
|
1563
1563
|
|
1564
1564
|
def _get_features_with_commercial_schema(
|
1565
|
-
self, commercial_schema: str, exclude_features_sources:
|
1565
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1566
1566
|
) -> list[str]:
|
1567
1567
|
if exclude_features_sources:
|
1568
1568
|
filtered_features_info = self._internal_features_info[
|
@@ -1577,14 +1577,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
1577
1577
|
].values
|
1578
1578
|
)
|
1579
1579
|
|
1580
|
-
def _has_paid_features(self, exclude_features_sources:
|
1580
|
+
def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
|
1581
1581
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
1582
1582
|
|
1583
1583
|
def _is_input_same_as_fit(
|
1584
1584
|
self,
|
1585
|
-
X:
|
1586
|
-
y:
|
1587
|
-
eval_set:
|
1585
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1586
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1587
|
+
eval_set: list[tuple] | None = None,
|
1588
1588
|
) -> tuple:
|
1589
1589
|
if X is None:
|
1590
1590
|
return True, self.X, self.y, self.eval_set
|
@@ -1615,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
1615
1615
|
def _get_cv_and_groups(
|
1616
1616
|
self,
|
1617
1617
|
X: pd.DataFrame,
|
1618
|
-
cv_override:
|
1618
|
+
cv_override: BaseCrossValidator | CVType | str | None,
|
1619
1619
|
search_keys: dict[str, SearchKey],
|
1620
|
-
) -> tuple[BaseCrossValidator,
|
1620
|
+
) -> tuple[BaseCrossValidator, np.ndarray] | None:
|
1621
1621
|
_cv = cv_override or self.cv
|
1622
1622
|
group_columns = sorted(self._get_group_columns(X, search_keys))
|
1623
1623
|
groups = None
|
@@ -1645,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1645
1645
|
return _cv, groups
|
1646
1646
|
|
1647
1647
|
def _get_and_validate_client_cat_features(
|
1648
|
-
self, estimator:
|
1649
|
-
) -> tuple[
|
1648
|
+
self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
|
1649
|
+
) -> tuple[list[str] | None, list[str]]:
|
1650
1650
|
cat_features = []
|
1651
1651
|
search_keys_for_metrics = []
|
1652
1652
|
if (
|
@@ -1678,16 +1678,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
1678
1678
|
def _get_cached_enriched_data(
|
1679
1679
|
self,
|
1680
1680
|
trace_id: str,
|
1681
|
-
X:
|
1682
|
-
y:
|
1683
|
-
eval_set:
|
1684
|
-
exclude_features_sources:
|
1685
|
-
remove_outliers_calc_metrics:
|
1686
|
-
cv_override:
|
1687
|
-
search_keys_for_metrics:
|
1688
|
-
progress_bar:
|
1689
|
-
progress_callback:
|
1690
|
-
client_cat_features:
|
1681
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1682
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1683
|
+
eval_set: list[tuple] | tuple | None = None,
|
1684
|
+
exclude_features_sources: list[str] | None = None,
|
1685
|
+
remove_outliers_calc_metrics: bool | None = None,
|
1686
|
+
cv_override: BaseCrossValidator | CVType | str | None = None,
|
1687
|
+
search_keys_for_metrics: list[str] | None = None,
|
1688
|
+
progress_bar: ProgressBar | None = None,
|
1689
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
1690
|
+
client_cat_features: list[str] | None = None,
|
1691
1691
|
is_for_metrics: bool = False,
|
1692
1692
|
):
|
1693
1693
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
@@ -1893,15 +1893,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
1893
1893
|
def _get_enriched_datasets(
|
1894
1894
|
self,
|
1895
1895
|
trace_id: str,
|
1896
|
-
validated_X:
|
1897
|
-
validated_y:
|
1898
|
-
eval_set:
|
1899
|
-
exclude_features_sources:
|
1896
|
+
validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
|
1897
|
+
validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
1898
|
+
eval_set: list[tuple] | None,
|
1899
|
+
exclude_features_sources: list[str] | None,
|
1900
1900
|
is_input_same_as_fit: bool,
|
1901
1901
|
is_demo_dataset: bool,
|
1902
|
-
remove_outliers_calc_metrics:
|
1903
|
-
progress_bar:
|
1904
|
-
progress_callback:
|
1902
|
+
remove_outliers_calc_metrics: bool | None,
|
1903
|
+
progress_bar: ProgressBar | None,
|
1904
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
1905
1905
|
is_for_metrics: bool = False,
|
1906
1906
|
) -> _EnrichedDataForMetrics:
|
1907
1907
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
@@ -1939,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1939
1939
|
)
|
1940
1940
|
|
1941
1941
|
def __get_sampled_cached_enriched(
|
1942
|
-
self, datasets_hash: str, exclude_features_sources:
|
1942
|
+
self, datasets_hash: str, exclude_features_sources: list[str] | None
|
1943
1943
|
) -> _EnrichedDataForMetrics:
|
1944
1944
|
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
|
1945
1945
|
self.__cached_sampled_datasets[datasets_hash]
|
@@ -1959,7 +1959,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1959
1959
|
)
|
1960
1960
|
|
1961
1961
|
def __get_enriched_as_input(
|
1962
|
-
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set:
|
1962
|
+
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
|
1963
1963
|
) -> _EnrichedDataForMetrics:
|
1964
1964
|
eval_set_sampled_dict = {}
|
1965
1965
|
|
@@ -2055,9 +2055,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
2055
2055
|
|
2056
2056
|
def __get_enriched_from_fit(
|
2057
2057
|
self,
|
2058
|
-
eval_set:
|
2058
|
+
eval_set: list[tuple] | None,
|
2059
2059
|
trace_id: str,
|
2060
|
-
remove_outliers_calc_metrics:
|
2060
|
+
remove_outliers_calc_metrics: bool | None,
|
2061
2061
|
) -> _EnrichedDataForMetrics:
|
2062
2062
|
eval_set_sampled_dict = {}
|
2063
2063
|
search_keys = self.fit_search_keys.copy()
|
@@ -2163,11 +2163,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2163
2163
|
self,
|
2164
2164
|
validated_X: pd.DataFrame,
|
2165
2165
|
validated_y: pd.Series,
|
2166
|
-
eval_set:
|
2167
|
-
exclude_features_sources:
|
2166
|
+
eval_set: list[tuple] | None,
|
2167
|
+
exclude_features_sources: list[str] | None,
|
2168
2168
|
trace_id: str,
|
2169
|
-
progress_bar:
|
2170
|
-
progress_callback:
|
2169
|
+
progress_bar: ProgressBar | None,
|
2170
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
2171
2171
|
is_for_metrics: bool = False,
|
2172
2172
|
) -> _EnrichedDataForMetrics:
|
2173
2173
|
has_eval_set = eval_set is not None
|
@@ -2231,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2231
2231
|
)
|
2232
2232
|
|
2233
2233
|
def __combine_train_and_eval_sets(
|
2234
|
-
self, X: pd.DataFrame, y:
|
2234
|
+
self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
|
2235
2235
|
) -> pd.DataFrame:
|
2236
2236
|
df = X.copy()
|
2237
2237
|
if y is not None:
|
@@ -2354,7 +2354,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2354
2354
|
generated_features=generated_features,
|
2355
2355
|
)
|
2356
2356
|
|
2357
|
-
def get_search_id(self) ->
|
2357
|
+
def get_search_id(self) -> str | None:
|
2358
2358
|
"""Returns search_id of the fitted enricher. Not available before a successful fit."""
|
2359
2359
|
return self._search_task.search_task_id if self._search_task else None
|
2360
2360
|
|
@@ -2367,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2367
2367
|
|
2368
2368
|
return self.features_info
|
2369
2369
|
|
2370
|
-
def get_progress(self, trace_id:
|
2370
|
+
def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
|
2371
2371
|
search_task = search_task or self._search_task
|
2372
2372
|
if search_task is not None:
|
2373
2373
|
trace_id = trace_id or uuid.uuid4()
|
@@ -2475,13 +2475,14 @@ if response.status_code == 200:
|
|
2475
2475
|
trace_id: str,
|
2476
2476
|
X: pd.DataFrame,
|
2477
2477
|
*,
|
2478
|
-
y:
|
2479
|
-
exclude_features_sources:
|
2478
|
+
y: pd.Series | None = None,
|
2479
|
+
exclude_features_sources: list[str] | None = None,
|
2480
2480
|
metrics_calculation: bool = False,
|
2481
2481
|
silent_mode: bool = False,
|
2482
|
-
progress_bar:
|
2483
|
-
progress_callback:
|
2482
|
+
progress_bar: ProgressBar | None = None,
|
2483
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2484
2484
|
add_fit_system_record_id: bool = False,
|
2485
|
+
keep_input: bool | None = None,
|
2485
2486
|
) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
|
2486
2487
|
if self._search_task is None:
|
2487
2488
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
@@ -2775,7 +2776,7 @@ if response.status_code == 200:
|
|
2775
2776
|
progress_bar.progress = progress.to_progress_bar()
|
2776
2777
|
if progress_callback is not None:
|
2777
2778
|
progress_callback(progress)
|
2778
|
-
prev_progress:
|
2779
|
+
prev_progress: SearchProgress | None = None
|
2779
2780
|
polling_period_seconds = 1
|
2780
2781
|
try:
|
2781
2782
|
while progress.stage != ProgressStage.DOWNLOADING.value:
|
@@ -2833,18 +2834,25 @@ if response.status_code == 200:
|
|
2833
2834
|
selected_generated_features = [
|
2834
2835
|
c for c in generated_features if not self.fit_select_features or c in self.feature_names_
|
2835
2836
|
]
|
2836
|
-
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2837
|
+
if keep_input is None:
|
2838
|
+
selected_input_columns = [
|
2839
|
+
c
|
2840
|
+
for c in validated_Xy.columns
|
2841
|
+
if not self.fit_select_features
|
2842
|
+
or c in self.feature_names_
|
2843
|
+
or c in self.search_keys
|
2844
|
+
or c in (self.id_columns or [])
|
2845
|
+
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
2846
|
+
]
|
2847
|
+
elif keep_input is True:
|
2848
|
+
selected_input_columns = validated_Xy.columns.to_list()
|
2849
|
+
else:
|
2850
|
+
selected_input_columns = []
|
2851
|
+
|
2845
2852
|
selecting_columns = selected_input_columns + selected_generated_features
|
2846
2853
|
selecting_columns.extend(
|
2847
|
-
c for c in result.columns
|
2854
|
+
c for c in result.columns
|
2855
|
+
if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
|
2848
2856
|
)
|
2849
2857
|
if add_fit_system_record_id:
|
2850
2858
|
selecting_columns.append(SORT_ID)
|
@@ -2871,7 +2879,7 @@ if response.status_code == 200:
|
|
2871
2879
|
|
2872
2880
|
return result, columns_renaming, generated_features, search_keys
|
2873
2881
|
|
2874
|
-
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id:
|
2882
|
+
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
2875
2883
|
if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
|
2876
2884
|
if search_id:
|
2877
2885
|
self.logger.debug(f"search_id {search_id} provided without search_keys")
|
@@ -2944,22 +2952,22 @@ if response.status_code == 200:
|
|
2944
2952
|
def __inner_fit(
|
2945
2953
|
self,
|
2946
2954
|
trace_id: str,
|
2947
|
-
X:
|
2948
|
-
y:
|
2949
|
-
eval_set:
|
2950
|
-
progress_bar:
|
2955
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
2956
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
2957
|
+
eval_set: list[tuple] | None,
|
2958
|
+
progress_bar: ProgressBar | None,
|
2951
2959
|
start_time: int,
|
2952
2960
|
*,
|
2953
|
-
exclude_features_sources:
|
2954
|
-
calculate_metrics:
|
2955
|
-
scoring:
|
2956
|
-
estimator:
|
2961
|
+
exclude_features_sources: list[str] | None = None,
|
2962
|
+
calculate_metrics: bool | None,
|
2963
|
+
scoring: Callable | str | None,
|
2964
|
+
estimator: Any | None,
|
2957
2965
|
stability_threshold: float,
|
2958
2966
|
stability_agg_func: str,
|
2959
|
-
remove_outliers_calc_metrics:
|
2967
|
+
remove_outliers_calc_metrics: bool | None,
|
2960
2968
|
auto_fe_parameters: AutoFEParameters,
|
2961
|
-
progress_callback:
|
2962
|
-
search_id_callback:
|
2969
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2970
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
2963
2971
|
):
|
2964
2972
|
self._search_task = None
|
2965
2973
|
self.warning_counter.reset()
|
@@ -3378,16 +3386,6 @@ if response.status_code == 200:
|
|
3378
3386
|
|
3379
3387
|
self.__show_selected_features()
|
3380
3388
|
|
3381
|
-
autofe_description = self.get_autofe_features_description()
|
3382
|
-
if autofe_description is not None and len(autofe_description) > 0:
|
3383
|
-
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
3384
|
-
self.autofe_features_display_handle = display_html_dataframe(
|
3385
|
-
df=autofe_description,
|
3386
|
-
internal_df=autofe_description,
|
3387
|
-
header=self.bundle.get("autofe_descriptions_header"),
|
3388
|
-
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
3389
|
-
)
|
3390
|
-
|
3391
3389
|
if self._has_paid_features(exclude_features_sources):
|
3392
3390
|
if calculate_metrics is not None and calculate_metrics:
|
3393
3391
|
msg = self.bundle.get("metrics_with_paid_features")
|
@@ -3466,7 +3464,7 @@ if response.status_code == 200:
|
|
3466
3464
|
def __should_add_date_column(self):
|
3467
3465
|
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
3468
3466
|
|
3469
|
-
def __get_renamed_id_columns(self, renaming:
|
3467
|
+
def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
|
3470
3468
|
renaming = renaming or self.fit_columns_renaming
|
3471
3469
|
reverse_renaming = {v: k for k, v in renaming.items()}
|
3472
3470
|
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
@@ -3511,11 +3509,11 @@ if response.status_code == 200:
|
|
3511
3509
|
def _validate_train_eval(
|
3512
3510
|
self,
|
3513
3511
|
X: pd.DataFrame,
|
3514
|
-
y:
|
3515
|
-
eval_set:
|
3512
|
+
y: pd.Series | None = None,
|
3513
|
+
eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
|
3516
3514
|
is_transform: bool = False,
|
3517
3515
|
silent: bool = False,
|
3518
|
-
) -> tuple[pd.DataFrame, pd.Series,
|
3516
|
+
) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
|
3519
3517
|
validated_X = self._validate_X(X, is_transform)
|
3520
3518
|
validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
|
3521
3519
|
validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
|
@@ -3594,7 +3592,7 @@ if response.status_code == 200:
|
|
3594
3592
|
|
3595
3593
|
return validated_X
|
3596
3594
|
|
3597
|
-
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) ->
|
3595
|
+
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
|
3598
3596
|
if y is None and not enforce_y:
|
3599
3597
|
return None
|
3600
3598
|
if (
|
@@ -3644,7 +3642,7 @@ if response.status_code == 200:
|
|
3644
3642
|
return validated_y
|
3645
3643
|
|
3646
3644
|
def _validate_eval_set(
|
3647
|
-
self, X: pd.DataFrame, eval_set:
|
3645
|
+
self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
|
3648
3646
|
):
|
3649
3647
|
if eval_set is None:
|
3650
3648
|
return None
|
@@ -3756,7 +3754,7 @@ if response.status_code == 200:
|
|
3756
3754
|
|
3757
3755
|
return validated_eval_X, validated_eval_y
|
3758
3756
|
|
3759
|
-
def _validate_baseline_score(self, X: pd.DataFrame, eval_set:
|
3757
|
+
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
|
3760
3758
|
if self.baseline_score_column is not None:
|
3761
3759
|
if self.baseline_score_column not in X.columns:
|
3762
3760
|
raise ValidationError(
|
@@ -3783,7 +3781,7 @@ if response.status_code == 200:
|
|
3783
3781
|
|
3784
3782
|
@staticmethod
|
3785
3783
|
def _sort_by_system_record_id(
|
3786
|
-
X: pd.DataFrame, y: pd.Series, cv:
|
3784
|
+
X: pd.DataFrame, y: pd.Series, cv: CVType | None
|
3787
3785
|
) -> tuple[pd.DataFrame, pd.Series]:
|
3788
3786
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3789
3787
|
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
@@ -3801,7 +3799,7 @@ if response.status_code == 200:
|
|
3801
3799
|
# Deprecated
|
3802
3800
|
@staticmethod
|
3803
3801
|
def _sort_by_keys(
|
3804
|
-
X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv:
|
3802
|
+
X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
|
3805
3803
|
) -> tuple[pd.DataFrame, pd.Series]:
|
3806
3804
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3807
3805
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
@@ -3841,14 +3839,14 @@ if response.status_code == 200:
|
|
3841
3839
|
def __log_debug_information(
|
3842
3840
|
self,
|
3843
3841
|
X: pd.DataFrame,
|
3844
|
-
y:
|
3845
|
-
eval_set:
|
3846
|
-
exclude_features_sources:
|
3847
|
-
calculate_metrics:
|
3848
|
-
cv:
|
3849
|
-
scoring:
|
3850
|
-
estimator:
|
3851
|
-
remove_outliers_calc_metrics:
|
3842
|
+
y: pd.Series | np.ndarray | list | None = None,
|
3843
|
+
eval_set: list[tuple] | None = None,
|
3844
|
+
exclude_features_sources: list[str] | None = None,
|
3845
|
+
calculate_metrics: bool | None = None,
|
3846
|
+
cv: Any | None = None,
|
3847
|
+
scoring: Any | None = None,
|
3848
|
+
estimator: Any | None = None,
|
3849
|
+
remove_outliers_calc_metrics: bool | None = None,
|
3852
3850
|
):
|
3853
3851
|
try:
|
3854
3852
|
resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
|
@@ -3973,7 +3971,7 @@ if response.status_code == 200:
|
|
3973
3971
|
]
|
3974
3972
|
|
3975
3973
|
@staticmethod
|
3976
|
-
def _get_email_column(search_keys: dict[str, SearchKey]) ->
|
3974
|
+
def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3977
3975
|
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
3978
3976
|
if len(cols) > 1:
|
3979
3977
|
raise Exception("More than one email column found after unnest")
|
@@ -3981,7 +3979,7 @@ if response.status_code == 200:
|
|
3981
3979
|
return cols[0]
|
3982
3980
|
|
3983
3981
|
@staticmethod
|
3984
|
-
def _get_hem_column(search_keys: dict[str, SearchKey]) ->
|
3982
|
+
def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3985
3983
|
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
3986
3984
|
if len(cols) > 1:
|
3987
3985
|
raise Exception("More than one hem column found after unnest")
|
@@ -3989,7 +3987,7 @@ if response.status_code == 200:
|
|
3989
3987
|
return cols[0]
|
3990
3988
|
|
3991
3989
|
@staticmethod
|
3992
|
-
def _get_ip_column(search_keys: dict[str, SearchKey]) ->
|
3990
|
+
def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
|
3993
3991
|
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
3994
3992
|
if len(cols) > 1:
|
3995
3993
|
raise Exception("More than one ip column found after unnest")
|
@@ -3997,25 +3995,25 @@ if response.status_code == 200:
|
|
3997
3995
|
return cols[0]
|
3998
3996
|
|
3999
3997
|
@staticmethod
|
4000
|
-
def _get_phone_column(search_keys: dict[str, SearchKey]) ->
|
3998
|
+
def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4001
3999
|
for col, t in search_keys.items():
|
4002
4000
|
if t == SearchKey.PHONE:
|
4003
4001
|
return col
|
4004
4002
|
|
4005
4003
|
@staticmethod
|
4006
|
-
def _get_country_column(search_keys: dict[str, SearchKey]) ->
|
4004
|
+
def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4007
4005
|
for col, t in search_keys.items():
|
4008
4006
|
if t == SearchKey.COUNTRY:
|
4009
4007
|
return col
|
4010
4008
|
|
4011
4009
|
@staticmethod
|
4012
|
-
def _get_postal_column(search_keys: dict[str, SearchKey]) ->
|
4010
|
+
def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4013
4011
|
for col, t in search_keys.items():
|
4014
4012
|
if t == SearchKey.POSTAL_CODE:
|
4015
4013
|
return col
|
4016
4014
|
|
4017
4015
|
@staticmethod
|
4018
|
-
def _get_date_column(search_keys: dict[str, SearchKey]) ->
|
4016
|
+
def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4019
4017
|
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
4020
4018
|
|
4021
4019
|
def _explode_multiple_search_keys(
|
@@ -4062,10 +4060,10 @@ if response.status_code == 200:
|
|
4062
4060
|
id_name: str,
|
4063
4061
|
target_name: str,
|
4064
4062
|
columns_renaming: dict[str, str],
|
4065
|
-
id_columns:
|
4066
|
-
cv:
|
4063
|
+
id_columns: list[str] | None,
|
4064
|
+
cv: CVType | None,
|
4067
4065
|
model_task_type: ModelTaskType,
|
4068
|
-
logger:
|
4066
|
+
logger: logging.Logger | None = None,
|
4069
4067
|
bundle: ResourceBundle = bundle,
|
4070
4068
|
) -> pd.DataFrame:
|
4071
4069
|
original_index_name = df.index.name
|
@@ -4201,7 +4199,7 @@ if response.status_code == 200:
|
|
4201
4199
|
def __enrich(
|
4202
4200
|
self,
|
4203
4201
|
input_df: pd.DataFrame,
|
4204
|
-
result_features:
|
4202
|
+
result_features: pd.DataFrame | None,
|
4205
4203
|
how: str = "inner",
|
4206
4204
|
drop_system_record_id=True,
|
4207
4205
|
) -> pd.DataFrame:
|
@@ -4320,7 +4318,7 @@ if response.status_code == 200:
|
|
4320
4318
|
self,
|
4321
4319
|
trace_id: str,
|
4322
4320
|
clients_features_df: pd.DataFrame,
|
4323
|
-
updated_shaps:
|
4321
|
+
updated_shaps: dict[str, float] | None = None,
|
4324
4322
|
update_selected_features: bool = True,
|
4325
4323
|
silent=False,
|
4326
4324
|
):
|
@@ -4659,12 +4657,12 @@ if response.status_code == 200:
|
|
4659
4657
|
|
4660
4658
|
def __show_metrics(
|
4661
4659
|
self,
|
4662
|
-
scoring:
|
4663
|
-
estimator:
|
4664
|
-
remove_outliers_calc_metrics:
|
4660
|
+
scoring: Callable | str | None,
|
4661
|
+
estimator: Any | None,
|
4662
|
+
remove_outliers_calc_metrics: bool | None,
|
4665
4663
|
trace_id: str,
|
4666
|
-
progress_bar:
|
4667
|
-
progress_callback:
|
4664
|
+
progress_bar: ProgressBar | None = None,
|
4665
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
4668
4666
|
):
|
4669
4667
|
self.metrics = self.calculate_metrics(
|
4670
4668
|
scoring=scoring,
|
@@ -4698,13 +4696,23 @@ if response.status_code == 200:
|
|
4698
4696
|
self.bundle.get("relevant_data_sources_header"),
|
4699
4697
|
display_id=f"data_sources_{uuid.uuid4()}",
|
4700
4698
|
)
|
4699
|
+
|
4700
|
+
autofe_description = self.get_autofe_features_description()
|
4701
|
+
if autofe_description is not None and len(autofe_description) > 0:
|
4702
|
+
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
4703
|
+
self.autofe_features_display_handle = display_html_dataframe(
|
4704
|
+
df=autofe_description,
|
4705
|
+
internal_df=autofe_description,
|
4706
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
4707
|
+
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
4708
|
+
)
|
4701
4709
|
else:
|
4702
4710
|
msg = self.bundle.get("features_info_zero_important_features")
|
4703
4711
|
self.__log_warning(msg, show_support_link=True)
|
4704
4712
|
except (ImportError, NameError):
|
4705
4713
|
print(self._internal_features_info)
|
4706
4714
|
|
4707
|
-
def __show_report_button(self, display_id:
|
4715
|
+
def __show_report_button(self, display_id: str | None = None, display_handle=None):
|
4708
4716
|
try:
|
4709
4717
|
return prepare_and_show_report(
|
4710
4718
|
relevant_features_df=self._features_info_without_links,
|
@@ -4844,7 +4852,7 @@ if response.status_code == 200:
|
|
4844
4852
|
except Exception:
|
4845
4853
|
self.logger.exception("Failed to dump python libs")
|
4846
4854
|
|
4847
|
-
def __display_support_link(self, link_text:
|
4855
|
+
def __display_support_link(self, link_text: str | None = None):
|
4848
4856
|
support_link = self.bundle.get("support_link")
|
4849
4857
|
link_text = link_text or self.bundle.get("support_text")
|
4850
4858
|
try:
|
@@ -4871,9 +4879,9 @@ if response.status_code == 200:
|
|
4871
4879
|
def dump_input(
|
4872
4880
|
self,
|
4873
4881
|
trace_id: str,
|
4874
|
-
X:
|
4875
|
-
y:
|
4876
|
-
eval_set:
|
4882
|
+
X: pd.DataFrame | pd.Series,
|
4883
|
+
y: pd.DataFrame | pd.Series | None = None,
|
4884
|
+
eval_set: tuple | None = None,
|
4877
4885
|
):
|
4878
4886
|
def dump_task(X_, y_, eval_set_):
|
4879
4887
|
with MDC(trace_id=trace_id):
|
@@ -4964,7 +4972,7 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
|
|
4964
4972
|
raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
|
4965
4973
|
|
4966
4974
|
|
4967
|
-
def drop_duplicates(df:
|
4975
|
+
def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
|
4968
4976
|
if isinstance(df, pd.DataFrame):
|
4969
4977
|
return df.drop_duplicates()
|
4970
4978
|
elif isinstance(df, np.ndarray):
|
upgini/http.py
CHANGED
@@ -413,43 +413,9 @@ class _RestClient:
|
|
413
413
|
with open(path, "rb") as file:
|
414
414
|
files = {"file": (file_name, file, "application/octet-stream")}
|
415
415
|
self._with_unauth_retry(
|
416
|
-
lambda: self._send_post_file_req_v2(
|
417
|
-
api_path, files, trace_id=trace_id, need_json_response=False
|
418
|
-
)
|
416
|
+
lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
|
419
417
|
)
|
420
418
|
|
421
|
-
def dump_input_files(
|
422
|
-
self,
|
423
|
-
trace_id: str,
|
424
|
-
x_path: str,
|
425
|
-
y_path: Optional[str] = None,
|
426
|
-
eval_x_path: Optional[str] = None,
|
427
|
-
eval_y_path: Optional[str] = None,
|
428
|
-
):
|
429
|
-
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
430
|
-
|
431
|
-
def upload_with_check(path: str, file_name: str):
|
432
|
-
digest_sha256 = file_hash(path)
|
433
|
-
if self.is_file_uploaded(trace_id, digest_sha256):
|
434
|
-
# print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
|
435
|
-
return
|
436
|
-
else:
|
437
|
-
with open(path, "rb") as file:
|
438
|
-
files = {"file": (file_name, file, "application/octet-stream")}
|
439
|
-
self._with_unauth_retry(
|
440
|
-
lambda: self._send_post_file_req_v2(
|
441
|
-
api_path, files, trace_id=trace_id, need_json_response=False
|
442
|
-
)
|
443
|
-
)
|
444
|
-
|
445
|
-
upload_with_check(x_path, "x.parquet")
|
446
|
-
if y_path:
|
447
|
-
upload_with_check(y_path, "y.parquet")
|
448
|
-
if eval_x_path:
|
449
|
-
upload_with_check(eval_x_path, "eval_x.parquet")
|
450
|
-
if eval_y_path:
|
451
|
-
upload_with_check(eval_y_path, "eval_y.parquet")
|
452
|
-
|
453
419
|
def initial_search_v2(
|
454
420
|
self,
|
455
421
|
trace_id: str,
|
@@ -1080,6 +1046,7 @@ class LoggerFactory:
|
|
1080
1046
|
|
1081
1047
|
upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
|
1082
1048
|
upgini_logger.handlers.clear()
|
1049
|
+
upgini_logger.propagate = False # Prevent duplicate logging in Jupyter notebooks
|
1083
1050
|
rest_client = get_rest_client(backend_url, api_token, client_ip, client_visitorid)
|
1084
1051
|
datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
|
1085
1052
|
json_formatter = jsonlogger.JsonFormatter(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.114a5
|
3
|
+
Version: 1.2.115a1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -749,6 +749,36 @@ enricher.fit(
|
|
749
749
|
)
|
750
750
|
```
|
751
751
|
|
752
|
+
### Control feature stability with PSI parameters
|
753
|
+
|
754
|
+
`FeaturesEnricher` supports Population Stability Index (PSI) calculation on `eval_set` to evaluate feature stability over time. You can control this behavior using the stability parameters of the `fit` and `fit_transform` methods:
|
755
|
+
|
756
|
+
```python
|
757
|
+
enricher = FeaturesEnricher(
|
758
|
+
search_keys={"registration_date": SearchKey.DATE}
|
759
|
+
)
|
760
|
+
|
761
|
+
# Control feature stability during fit
|
762
|
+
enricher.fit(
|
763
|
+
X, y,
|
764
|
+
stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
|
765
|
+
stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
|
766
|
+
)
|
767
|
+
|
768
|
+
# Same parameters work for fit_transform
|
769
|
+
enriched_df = enricher.fit_transform(
|
770
|
+
X, y,
|
771
|
+
stability_threshold=0.1, # Stricter threshold for more stable features
|
772
|
+
stability_agg_func="mean" # Use mean aggregation instead of max
|
773
|
+
)
|
774
|
+
```
|
775
|
+
|
776
|
+
**Stability parameters:**
|
777
|
+
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
|
778
|
+
- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
|
779
|
+
|
780
|
+
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
|
781
|
+
|
752
782
|
### Use custom loss function in feature selection & metrics calculation
|
753
783
|
|
754
784
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -1,10 +1,10 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=bgpppKWVKHgLo8IRKBM8YYuR4qMETYP4hSkfrlgcwgU,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256
|
6
|
+
upgini/features_enricher.py,sha256=Ogye2TBqV-k1Znbc3ffmRKwsff00P7ea5useiuOsZIc,228799
|
7
|
+
upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
9
|
upgini/metrics.py,sha256=gjJDtlV6JrhUJumbNipdzjY4ojEupHGPihb9_VxjtWc,45939
|
10
10
|
upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
79
|
-
upgini-1.2.
|
80
|
-
upgini-1.2.
|
77
|
+
upgini-1.2.115a1.dist-info/METADATA,sha256=_aYBoX0V8yPCap7XV0FQp_5_RIJWH1aVe0S5jDvDGXw,50695
|
78
|
+
upgini-1.2.115a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.115a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.115a1.dist-info/RECORD,,
|
File without changes
|
File without changes
|