upgini 1.2.70a3832.dev3__py3-none-any.whl → 1.2.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +15 -21
- upgini/autofe/feature.py +5 -1
- upgini/autofe/timeseries/base.py +3 -9
- upgini/autofe/timeseries/cross.py +22 -12
- upgini/autofe/timeseries/roll.py +2 -7
- upgini/autofe/timeseries/trend.py +2 -1
- upgini/autofe/utils.py +83 -0
- upgini/dataset.py +8 -1
- upgini/features_enricher.py +335 -270
- upgini/metadata.py +4 -0
- upgini/metrics.py +67 -60
- upgini/resource_bundle/strings.properties +1 -0
- upgini/search_task.py +7 -1
- upgini/utils/mstats.py +1 -1
- upgini/utils/sklearn_ext.py +11 -0
- upgini/utils/sort.py +1 -1
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71.dist-info}/METADATA +3 -4
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71.dist-info}/RECORD +22 -22
- upgini/lazy_import.py +0 -35
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71.dist-info}/WHEEL +0 -0
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -54,6 +55,7 @@ from upgini.metadata import (
     SORT_ID,
     SYSTEM_RECORD_ID,
     TARGET,
+    AutoFEParameters,
     CVType,
     FeaturesMetadataV2,
     FileColumnMeaningType,
@@ -407,6 +409,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         **kwargs,
     ):
         """Fit to data.
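The `auto_fe_parameters` argument added here is threaded from `fit` through `fit_transform` and the inner fit into the search request (see the matching hunks below). A minimal usage sketch; `AutoFEParameters` comes from `upgini.metadata` per this diff, but its constructor fields are not shown here, so it is instantiated with defaults:

    import pandas as pd
    from upgini import FeaturesEnricher, SearchKey
    from upgini.metadata import AutoFEParameters

    X = pd.DataFrame({"rep_date": pd.date_range("2024-01-01", periods=100)})
    y = pd.Series(range(100))

    enricher = FeaturesEnricher(search_keys={"rep_date": SearchKey.DATE})
    # AutoFE settings are forwarded down to the search task; constructor
    # kwargs are omitted because this diff does not show the class's fields.
    enricher.fit(X, y, auto_fe_parameters=AutoFEParameters())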
@@ -495,6 +498,7 @@ class FeaturesEnricher(TransformerMixin):
             importance_threshold=importance_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+            auto_fe_parameters=auto_fe_parameters,
             progress_callback=progress_callback,
             search_id_callback=search_id_callback,
         )
@@ -550,6 +554,7 @@
         remove_outliers_calc_metrics: Optional[bool] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -649,6 +654,7 @@
             importance_threshold=importance_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+            auto_fe_parameters=auto_fe_parameters,
             progress_callback=progress_callback,
         )
         self.logger.info("Inner fit finished successfully")
@@ -703,6 +709,7 @@
         self,
         X: pd.DataFrame,
         *args,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
         importance_threshold: Optional[float] = None,
@@ -763,9 +770,10 @@
 
         start_time = time.time()
         try:
-            result, _, _ = self.__inner_transform(
+            result, _, _, _ = self.__inner_transform(
                 trace_id,
                 X,
+                y=y,
                 exclude_features_sources=exclude_features_sources,
                 importance_threshold=importance_threshold,
                 max_features=max_features,
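`transform` now accepts an optional `y` and forwards it to `__inner_transform`, which needs the target when a server-side feature is computed from `TARGET` (see the `missing_target_for_transform` check further down). Continuing the sketch above:

    # y is optional; omitting it keeps the old behavior, but a transform that
    # requires target-derived features will raise a ValidationError without it.
    enriched_df = enricher.transform(X, y=y, keep_input=True)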
@@ -834,7 +842,7 @@
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-        silent: bool = False,
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1088,7 +1096,7 @@
                 enriched_shaps = enriched_cv_result.shap_values
 
                 if enriched_shaps is not None:
-                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent)
+                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
 
                 if enriched_metric is None:
                     self.logger.warning(
@@ -1249,7 +1257,7 @@
             if self.raise_validation_error:
                 raise e
             else:
-                if not silent:
+                if not internal_call:
                     self._dump_python_libs()
                     self.__display_support_link()
                 raise e
@@ -1478,12 +1486,12 @@
 
         excluding_search_keys = list(search_keys.keys())
         if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
+            should_not_exclude = set()
             for sk in excluding_search_keys:
-                renamed_sk = columns_renaming.get(sk)
+                renamed_sk = columns_renaming.get(sk, sk)
                 if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
-                    excluding_search_keys = [sk for sk in excluding_search_keys if sk not in
+                    should_not_exclude.add(sk)
+            excluding_search_keys = [sk for sk in excluding_search_keys if sk not in should_not_exclude]
 
         self.logger.info(f"Excluding search keys: {excluding_search_keys}")
 
@@ -1505,8 +1513,7 @@
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
 
         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
 
@@ -1682,7 +1689,6 @@
                 validated_X,
                 validated_y,
                 eval_set,
-                is_demo_dataset,
                 exclude_features_sources,
                 trace_id,
                 progress_bar,
@@ -1698,8 +1704,14 @@
         if exclude_features_sources:
             enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
 
-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
+        return self.__cache_and_return_results(
+            datasets_hash,
+            X_sampled,
+            y_sampled,
+            enriched_X,
+            eval_set_sampled_dict,
+            columns_renaming,
+            search_keys,
         )
 
     def __sample_only_input(
@@ -1776,17 +1788,14 @@
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
         datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
-        self.__cached_sampled_datasets[datasets_hash] = (
+        return self.__cache_and_return_results(
+            datasets_hash,
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
-            search_keys,
             columns_renaming,
-        )
-
-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
+            search_keys,
         )
 
     def __sample_balanced(
@@ -1825,13 +1834,34 @@
         # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
         # can differs from it
         fit_features = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
-        enriched_Xy, enriched_eval_sets = self.__enrich(
+
+        # Pre-process features if we need to drop outliers
+        if rows_to_drop is not None:
+            self.logger.info(f"Before dropping target outliers size: {len(fit_features)}")
+            fit_features = fit_features[
+                ~fit_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
+            ]
+            self.logger.info(f"After dropping target outliers size: {len(fit_features)}")
+
+        enriched_eval_sets = {}
+        enriched_Xy = self.__enrich(
             self.df_with_original_index,
             fit_features,
-            rows_to_drop=rows_to_drop,
+            how="inner",
             drop_system_record_id=False,
         )
 
+        # Handle eval sets extraction based on EVAL_SET_INDEX
+        if EVAL_SET_INDEX in enriched_Xy.columns:
+            eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
+            if 0 in eval_set_indices:
+                eval_set_indices.remove(0)
+            for eval_set_index in eval_set_indices:
+                enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
+                    enriched_Xy[EVAL_SET_INDEX] == eval_set_index
+                ].copy()
+            enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
+
         x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
         X_sampled = enriched_Xy[x_columns].copy()
         y_sampled = enriched_Xy[TARGET].copy()
@@ -1855,17 +1885,14 @@
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
-        self.__cached_sampled_datasets[datasets_hash] = (
+        return self.__cache_and_return_results(
+            datasets_hash,
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
-            search_keys,
             self.fit_columns_renaming,
-        )
-
-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
+            search_keys,
         )
 
     def __sample_imbalanced(
@@ -1873,169 +1900,162 @@
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
-        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-        if eval_set is not None:
-            self.logger.info("Transform with eval_set")
-            # concatenate X and eval_set with eval_set_index
-            df = validated_X.copy()
-            df[TARGET] = validated_y
-            df[EVAL_SET_INDEX] = 0
-            for idx, eval_pair in enumerate(eval_set):
-                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                eval_df_with_index = eval_x.copy()
-                eval_df_with_index[TARGET] = eval_y
-                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df_with_index])
-
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-            # downsample if need to eval_set threshold
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
-            # TODO: check that system_record_id was added before this step
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
+        has_eval_set = eval_set is not None
 
+        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
 
+        # Prepare
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df = self.__downsample_for_metrics(df)
 
+        # Transform
+        enriched_df, columns_renaming, generated_features, search_keys = self.__inner_transform(
+            trace_id,
+            X=df.drop(columns=[TARGET]),
+            y=df[TARGET],
+            exclude_features_sources=exclude_features_sources,
+            silent_mode=True,
+            metrics_calculation=True,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            add_fit_system_record_id=True,
+        )
+        if enriched_df is None:
+            return None
 
+        x_columns = [
+            c
+            for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
+            if c in enriched_df.columns
+        ]
 
+        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+        eval_set_sampled_dict = self.__extract_eval_data(
+            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+        )
 
+        # Cache and return results
+        datasets_hash = hash_input(validated_X, validated_y, eval_set)
+        return self.__cache_and_return_results(
+            datasets_hash,
+            X_sampled,
+            y_sampled,
+            enriched_X,
+            eval_set_sampled_dict,
+            columns_renaming,
+            search_keys,
+        )
 
+    def __combine_train_and_eval_sets(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+    ) -> pd.DataFrame:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
+            return df
+
+        df[EVAL_SET_INDEX] = 0
 
+        for idx, eval_pair in enumerate(eval_set):
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_df_with_index = eval_x.copy()
+            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+            df = pd.concat([df, eval_df_with_index])
 
+        return df
 
+    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        num_samples = _num_samples(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and num_samples > Dataset.FORCE_SAMPLE_SIZE
+        )
+
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
             )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
 
-            if
-                self.logger.info(f"
-                df =
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
 
-        df = df.rename(columns={TARGET: tmp_target_name})
+        return df
 
-        if enriched_Xy is None:
-            return None
+    def __extract_train_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str]
+    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+        if EVAL_SET_INDEX in enriched_df.columns:
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            enriched_Xy = enriched_df
+        X_sampled = enriched_Xy[x_columns].copy()
+        y_sampled = enriched_Xy[TARGET].copy()
+        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+        return X_sampled, y_sampled, enriched_X
 
+    def __extract_eval_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+    ) -> Dict[int, Tuple]:
+        eval_set_sampled_dict = {}
 
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+            eval_x_sampled = enriched_eval_xy[x_columns].copy()
+            eval_y_sampled = enriched_eval_xy[TARGET].copy()
+            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
 
+        return eval_set_sampled_dict
+
+    def __cache_and_return_results(
+        self,
+        datasets_hash: str,
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict[int, Tuple],
+        columns_renaming: Dict[str, str],
+        search_keys: Dict[str, SearchKey],
+    ) -> _SampledDataForMetrics:
 
-        datasets_hash = hash_input(validated_X, validated_y, eval_set)
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
+            search_keys,
             columns_renaming,
         )
 
         return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict,
+            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
        )
 
     def __mk_sampled_data_tuple(
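The new helpers slice the combined frame back apart with `DataFrame.query` over `EVAL_SET_INDEX`: train rows carry 0, the i-th eval pair carries i + 1. A self-contained toy illustration (an illustrative lowercase column name stands in for the real constant):

    import pandas as pd

    df = pd.DataFrame({"eval_set_index": [0, 0, 1, 2], "f": [1, 2, 3, 4]})
    train = df.query("eval_set_index == 0")   # two train rows
    eval_1 = df.query("eval_set_index == 1")  # first eval pair
    eval_2 = df.query("eval_set_index == 2")  # second eval pair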
@@ -2047,7 +2067,11 @@
         search_keys: Dict,
         columns_renaming: Dict[str, str],
     ):
-        search_keys = {
+        search_keys = {
+            columns_renaming.get(k, k): v
+            for k, v in search_keys.items()
+            if columns_renaming.get(k, k) in X_sampled.columns.to_list()
+        }
         return FeaturesEnricher._SampledDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2161,6 +2185,7 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2169,8 +2194,7 @@
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         add_fit_system_record_id: bool = False,
-    ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
+    ) -> Tuple[pd.DataFrame, Dict[str, str], List[str], Dict[str, SearchKey]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
 
@@ -2179,20 +2203,28 @@
         self.logger.info("Start transform")
 
         validated_X = self._validate_X(X, is_transform=True)
+        if y is not None:
+            validated_y = self._validate_y(validated_X, y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+        else:
+            validated_y = None
+            df = validated_X
+
+        validated_Xy = df.copy()
 
-        self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+        self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
         self.__validate_search_keys(self.search_keys, self.search_id)
 
         if len(self.feature_names_) == 0:
             self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, []
+            return X, {c: c for c in X.columns}, [], {}
 
         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
             self.logger.warning(msg)
             self.__display_support_link(msg)
-            return None, {c: c for c in X.columns}, []
+            return None, {c: c for c in X.columns}, [], {}
 
         features_meta = self._search_task.get_all_features_metadata_v2()
         online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
@@ -2215,7 +2247,7 @@
             self.logger.warning(msg)
             print(msg)
             show_request_quote_button()
-            return None, {c: c for c in X.columns}, []
+            return None, {c: c for c in X.columns}, [], {}
         else:
             msg = self.bundle.get("transform_usage_info").format(
                 transform_usage.limit, transform_usage.transformed_rows
@@ -2223,29 +2255,27 @@
             self.logger.info(msg)
             print(msg)
 
-        is_demo_dataset = hash_input(
+        is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
         columns_to_drop = [
-            c for c in
+            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
         ]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
             print(msg)
+            df = df.drop(columns=columns_to_drop)
 
         search_keys = self.search_keys.copy()
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
+            search_keys.update(
                 {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
             )
 
         search_keys = self.__prepare_search_keys(
-            validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+            df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
         )
 
-        df = validated_X.copy()
-
         df = self.__handle_index_search_keys(df, search_keys)
 
         if DEFAULT_INDEX in df.columns:
@@ -2253,7 +2283,7 @@
             self.logger.info(msg)
             print(msg)
             df.drop(columns=DEFAULT_INDEX, inplace=True)
+            validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
 
         df = self.__add_country_code(df, search_keys)
 
@@ -2284,8 +2314,11 @@
         features_for_transform = self._search_task.get_features_for_transform() or []
         if len(features_for_transform) > 0:
             missing_features_for_transform = [
-                columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+                columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
             ]
+            if TARGET in missing_features_for_transform:
+                raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
             if len(missing_features_for_transform) > 0:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2341,11 +2374,10 @@
             converter = PostalCodeSearchKeyConverter(postal_code)
             df = converter.convert(df)
 
+        meaning_types = {}
+        meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})
 
-        meaning_types = {col: key.value for col, key in search_keys.items()}
-        for col in features_for_transform:
-            meaning_types[col] = FileColumnMeaningType.FEATURE
         features_not_to_pass = [
             c
             for c in df.columns
@@ -2354,13 +2386,12 @@
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]
 
-        if add_fit_system_record_id
-            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+        if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
+                TARGET,
                 columns_renaming,
                 silent=True,
             )
@@ -2485,28 +2516,34 @@
         if progress_callback is not None:
             progress_callback(progress)
 
-        def enrich():
-            res, _ = self.__enrich(
-                df_with_original_index,
-                validation_task.get_all_validation_raw_features(trace_id, metrics_calculation),
-                validated_X,
-                is_transform=True,
-            )
-            return res
-
         if not silent_mode:
             print(self.bundle.get("transform_start"))
+
+        # Prepare input DataFrame for __enrich by concatenating generated ids and client features
+        combined_df = pd.concat(
+            [
+                validated_Xy.reset_index(drop=True),
+                df_with_original_index.reset_index(drop=True),
+            ],
+            axis=1,
+        ).set_index(validated_Xy.index)
+
+        result_features = validation_task.get_all_validation_raw_features(trace_id, metrics_calculation)
+
+        result = self.__enrich(
+            combined_df,
+            result_features,
+            how="left",
+        )
 
         selecting_columns = [
             c
-            for c in itertools.chain(
+            for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
             if c not in self.dropped_client_feature_names_
         ]
-        filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
         selecting_columns.extend(
             c for c in filtered_columns if c in result.columns and c not in validated_X.columns
         )
|
|
|
2515
2552
|
|
|
2516
2553
|
selecting_columns = list(set(selecting_columns))
|
|
2517
2554
|
# sorting: first columns from X, then generated features, then enriched features
|
|
2518
|
-
sorted_selecting_columns = [c for c in
|
|
2555
|
+
sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
|
|
2519
2556
|
for c in generated_features:
|
|
2520
2557
|
if c in selecting_columns and c not in sorted_selecting_columns:
|
|
2521
2558
|
sorted_selecting_columns.append(c)
|
|
@@ -2533,7 +2570,7 @@
         if add_fit_system_record_id:
             result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
 
-        return result, columns_renaming, generated_features
+        return result, columns_renaming, generated_features, search_keys
 
     def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
         features_info = self._internal_features_info
@@ -2643,6 +2680,7 @@
         importance_threshold: Optional[float],
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         search_id_callback: Optional[Callable[[str], Any]] = None,
     ):
@@ -2948,6 +2986,7 @@
             runtime_parameters=runtime_parameters,
             exclude_features_sources=exclude_features_sources,
             force_downsampling=force_downsampling,
+            auto_fe_parameters=auto_fe_parameters,
         )
 
         if search_id_callback is not None:
@@ -3211,8 +3250,7 @@
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair[0]
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair
 
         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3712,23 +3750,19 @@
 
     def __enrich(
         self,
-        df_with_original_index: pd.DataFrame,
+        input_df: pd.DataFrame,
         result_features: Optional[pd.DataFrame],
-        X: Optional[pd.DataFrame] = None,
-        is_transform=False,
-        rows_to_drop: Optional[pd.DataFrame] = None,
+        how: str = "inner",
         drop_system_record_id=True,
-    ) ->
+    ) -> pd.DataFrame:
         if result_features is None:
             self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
             raise RuntimeError(self.bundle.get("features_wasnt_returned"))
-        result_features = (
-            result_features.drop(columns=EVAL_SET_INDEX)
-            if EVAL_SET_INDEX in result_features.columns
-            else result_features
-        )
 
+        if EVAL_SET_INDEX in result_features.columns:
+            result_features = result_features.drop(columns=EVAL_SET_INDEX)
+
+        comparing_columns = input_df.columns
         dup_features = [
             c
             for c in comparing_columns
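`__enrich` now takes a `how` argument in place of the old `is_transform`/`rows_to_drop` flags: the metrics path above calls it with how="inner" (only rows the backend matched survive), while the transform path uses how="left" (every input row survives, with NaN where nothing came back). The difference in plain pandas, with an illustrative join key:

    import pandas as pd

    input_df = pd.DataFrame({"entity_id": [1, 2, 3], "x": [10, 20, 30]})
    feats = pd.DataFrame({"entity_id": [1, 3], "f_new": [0.1, 0.3]})

    inner = pd.merge(input_df, feats, on="entity_id", how="inner")  # 2 rows: ids 1 and 3
    left = pd.merge(input_df, feats, on="entity_id", how="left")    # 3 rows: NaN f_new for id 2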
@@ -3738,63 +3772,80 @@
             self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
             raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
 
-        # index
-        original_index_name =
+        # Handle index and column renaming
+        original_index_name = input_df.index.name
+        renamed_column = None
+
+        # Handle column rename if it conflicts with index name
+        if original_index_name in input_df.columns:
+            renamed_column = f"{original_index_name}_renamed"
+            input_df = input_df.rename(columns={original_index_name: renamed_column})
+
+        # Reset index for the merge operation
+        input_df = input_df.reset_index()
+
         # TODO drop system_record_id before merge
+        # Merge with result features
         result_features = pd.merge(
-            df_with_original_index,
+            input_df,
             result_features,
             on=ENTITY_SYSTEM_RECORD_ID,
-            how=
+            how=how,
         )
+
+        # Restore the index
         result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
         result_features.index.name = original_index_name
 
-        if rows_to_drop is not None:
-            self.logger.info(f"Before dropping target outliers size: {len(result_features)}")
-            result_features = result_features[
-                ~result_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
-            ]
-            self.logger.info(f"After dropping target outliers size: {len(result_features)}")
-
-        result_eval_sets = {}
-        if not is_transform and EVAL_SET_INDEX in result_features.columns:
-            result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
-            eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
-            if 0 in eval_set_indices:
-                eval_set_indices.remove(0)
-            for eval_set_index in eval_set_indices:
-                result_eval_sets[eval_set_index] = result_features.loc[
-                    result_features[EVAL_SET_INDEX] == eval_set_index
-                ].copy()
-            result_train_features = result_train_features.drop(columns=EVAL_SET_INDEX)
-        else:
-            result_train_features = result_features
-
-        if is_transform:
-            index_name = X.index.name
-            renamed_column = None
-            if index_name in X.columns:
-                renamed_column = f"{index_name}_renamed"
-                X = X.rename(columns={index_name: renamed_column})
-            result_train = pd.concat([X.reset_index(), result_train_features.reset_index(drop=True)], axis=1).set_index(
-                index_name or DEFAULT_INDEX
-            )
-            result_train.index.name = index_name
-            if renamed_column is not None:
-                result_train = result_train.rename(columns={renamed_column: index_name})
-        else:
-            result_train = result_train_features
+        # Restore renamed column if needed
+        if renamed_column is not None:
+            result_features = result_features.rename(columns={renamed_column: original_index_name})
 
         if drop_system_record_id:
-            result_train = result_train.drop(columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore")
-            for eval_set_index in result_eval_sets.keys():
-                result_eval_sets[eval_set_index] = result_eval_sets[eval_set_index].drop(
-                    columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore"
-                )
+            result_features = result_features.drop(columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore")
 
-        return result_train, result_eval_sets
+        return result_features
+
+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
+
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        importances = {}
+
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+
+            is_client_feature = feature_meta.name in df.columns
+
+            if feature_meta.shap_value == 0.0:
+                continue
+
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+
+            importances[feature_meta.name] = feature_meta.shap_value
+
+        return importances
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -3804,6 +3855,7 @@
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
 
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
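Both importance readers now `deepcopy` the metadata before rewriting `feature_meta.name`, so a later call does not see names already mapped back to their original form by an earlier one. A standalone demonstration of the aliasing this avoids (toy types, not the library's own):

    from copy import deepcopy

    class Meta:
        def __init__(self, name, shap_value=0.5):
            self.name, self.shap_value = name, shap_value

    cached = [Meta("f_ab12")]
    original_names = {"f_ab12": "age"}

    def read_importances(metas):
        metas = deepcopy(metas)  # without the copy, the loop mutates the shared cache
        for m in metas:
            m.name = original_names.get(m.name, m.name)
        return {m.name: m.shap_value for m in metas}

    assert read_importances(cached) == {"age": 0.5}
    assert cached[0].name == "f_ab12"  # cached metadata stays untouched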
@@ -3819,15 +3871,23 @@
 
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
@@ -3839,7 +3899,7 @@
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -3981,16 +4041,19 @@
         )
 
     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)
 
-        filtered_importances
+        if len(filtered_importances) == 0:
+            return []
 
         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance)
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]
@@ -4129,7 +4192,7 @@
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-            silent=True,
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
@@ -4203,11 +4266,13 @@
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)
 
-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
 
     def __detect_missing_search_keys(
         self,