upgini 1.2.70a3832.dev2__py3-none-any.whl → 1.2.71a3810.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +15 -21
- upgini/autofe/feature.py +5 -1
- upgini/autofe/timeseries/base.py +1 -7
- upgini/autofe/timeseries/cross.py +21 -11
- upgini/autofe/timeseries/roll.py +2 -7
- upgini/autofe/timeseries/trend.py +2 -1
- upgini/autofe/unary.py +37 -1
- upgini/autofe/utils.py +83 -0
- upgini/dataset.py +9 -2
- upgini/features_enricher.py +259 -253
- upgini/http.py +4 -9
- upgini/metadata.py +4 -0
- upgini/metrics.py +48 -145
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +7 -1
- upgini/utils/deduplicate_utils.py +0 -2
- upgini/utils/feature_info.py +1 -2
- upgini/utils/mstats.py +1 -1
- upgini/utils/sklearn_ext.py +2 -9
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71a3810.dev1.dist-info}/METADATA +6 -8
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71a3810.dev1.dist-info}/RECORD +24 -23
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71a3810.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71a3810.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -54,6 +54,7 @@ from upgini.metadata import (
|
|
|
54
54
|
SORT_ID,
|
|
55
55
|
SYSTEM_RECORD_ID,
|
|
56
56
|
TARGET,
|
|
57
|
+
AutoFEParameters,
|
|
57
58
|
CVType,
|
|
58
59
|
FeaturesMetadataV2,
|
|
59
60
|
FileColumnMeaningType,
|
|
@@ -407,6 +408,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
407
408
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
408
409
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
|
409
410
|
select_features: bool = True,
|
|
411
|
+
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
|
410
412
|
**kwargs,
|
|
411
413
|
):
|
|
412
414
|
"""Fit to data.
|
|
@@ -495,6 +497,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
495
497
|
importance_threshold=importance_threshold,
|
|
496
498
|
max_features=max_features,
|
|
497
499
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
|
500
|
+
auto_fe_parameters=auto_fe_parameters,
|
|
498
501
|
progress_callback=progress_callback,
|
|
499
502
|
search_id_callback=search_id_callback,
|
|
500
503
|
)
|
|
@@ -550,6 +553,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
550
553
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
551
554
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
552
555
|
select_features: bool = True,
|
|
556
|
+
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
|
553
557
|
**kwargs,
|
|
554
558
|
) -> pd.DataFrame:
|
|
555
559
|
"""Fit to data, then transform it.
|
|
@@ -649,6 +653,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
649
653
|
importance_threshold=importance_threshold,
|
|
650
654
|
max_features=max_features,
|
|
651
655
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
|
656
|
+
auto_fe_parameters=auto_fe_parameters,
|
|
652
657
|
progress_callback=progress_callback,
|
|
653
658
|
)
|
|
654
659
|
self.logger.info("Inner fit finished successfully")
|
|
@@ -703,6 +708,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
703
708
|
self,
|
|
704
709
|
X: pd.DataFrame,
|
|
705
710
|
*args,
|
|
711
|
+
y: Optional[pd.Series] = None,
|
|
706
712
|
exclude_features_sources: Optional[List[str]] = None,
|
|
707
713
|
keep_input: bool = True,
|
|
708
714
|
importance_threshold: Optional[float] = None,
|
|
@@ -763,9 +769,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
763
769
|
|
|
764
770
|
start_time = time.time()
|
|
765
771
|
try:
|
|
766
|
-
result, _, _ = self.__inner_transform(
|
|
772
|
+
result, _, _, _ = self.__inner_transform(
|
|
767
773
|
trace_id,
|
|
768
774
|
X,
|
|
775
|
+
y=y,
|
|
769
776
|
exclude_features_sources=exclude_features_sources,
|
|
770
777
|
importance_threshold=importance_threshold,
|
|
771
778
|
max_features=max_features,
|
|
@@ -1478,12 +1485,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1478
1485
|
|
|
1479
1486
|
excluding_search_keys = list(search_keys.keys())
|
|
1480
1487
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1481
|
-
|
|
1488
|
+
should_not_exclude = set()
|
|
1482
1489
|
for sk in excluding_search_keys:
|
|
1483
|
-
renamed_sk = columns_renaming.get(sk)
|
|
1490
|
+
renamed_sk = columns_renaming.get(sk, sk)
|
|
1484
1491
|
if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
|
|
1485
|
-
|
|
1486
|
-
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in
|
|
1492
|
+
should_not_exclude.add(sk)
|
|
1493
|
+
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in should_not_exclude]
|
|
1487
1494
|
|
|
1488
1495
|
self.logger.info(f"Excluding search keys: {excluding_search_keys}")
|
|
1489
1496
|
|
|
@@ -1682,7 +1689,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1682
1689
|
validated_X,
|
|
1683
1690
|
validated_y,
|
|
1684
1691
|
eval_set,
|
|
1685
|
-
is_demo_dataset,
|
|
1686
1692
|
exclude_features_sources,
|
|
1687
1693
|
trace_id,
|
|
1688
1694
|
progress_bar,
|
|
@@ -1698,8 +1704,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1698
1704
|
if exclude_features_sources:
|
|
1699
1705
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1700
1706
|
|
|
1701
|
-
return self.
|
|
1702
|
-
|
|
1707
|
+
return self.__cache_and_return_results(
|
|
1708
|
+
datasets_hash,
|
|
1709
|
+
X_sampled,
|
|
1710
|
+
y_sampled,
|
|
1711
|
+
enriched_X,
|
|
1712
|
+
eval_set_sampled_dict,
|
|
1713
|
+
columns_renaming,
|
|
1714
|
+
search_keys,
|
|
1703
1715
|
)
|
|
1704
1716
|
|
|
1705
1717
|
def __sample_only_input(
|
|
@@ -1776,17 +1788,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1776
1788
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1777
1789
|
|
|
1778
1790
|
datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
|
|
1779
|
-
self.
|
|
1791
|
+
return self.__cache_and_return_results(
|
|
1792
|
+
datasets_hash,
|
|
1780
1793
|
X_sampled,
|
|
1781
1794
|
y_sampled,
|
|
1782
1795
|
enriched_X,
|
|
1783
1796
|
eval_set_sampled_dict,
|
|
1784
|
-
search_keys,
|
|
1785
1797
|
columns_renaming,
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
return self.__mk_sampled_data_tuple(
|
|
1789
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1798
|
+
search_keys,
|
|
1790
1799
|
)
|
|
1791
1800
|
|
|
1792
1801
|
def __sample_balanced(
|
|
@@ -1825,13 +1834,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1825
1834
|
# index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
|
|
1826
1835
|
# can differs from it
|
|
1827
1836
|
fit_features = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
1828
|
-
|
|
1837
|
+
|
|
1838
|
+
# Pre-process features if we need to drop outliers
|
|
1839
|
+
if rows_to_drop is not None:
|
|
1840
|
+
self.logger.info(f"Before dropping target outliers size: {len(fit_features)}")
|
|
1841
|
+
fit_features = fit_features[
|
|
1842
|
+
~fit_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
|
|
1843
|
+
]
|
|
1844
|
+
self.logger.info(f"After dropping target outliers size: {len(fit_features)}")
|
|
1845
|
+
|
|
1846
|
+
enriched_eval_sets = {}
|
|
1847
|
+
enriched_Xy = self.__enrich(
|
|
1829
1848
|
self.df_with_original_index,
|
|
1830
1849
|
fit_features,
|
|
1831
|
-
|
|
1850
|
+
how="inner",
|
|
1832
1851
|
drop_system_record_id=False,
|
|
1833
1852
|
)
|
|
1834
1853
|
|
|
1854
|
+
# Handle eval sets extraction based on EVAL_SET_INDEX
|
|
1855
|
+
if EVAL_SET_INDEX in enriched_Xy.columns:
|
|
1856
|
+
eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
|
|
1857
|
+
if 0 in eval_set_indices:
|
|
1858
|
+
eval_set_indices.remove(0)
|
|
1859
|
+
for eval_set_index in eval_set_indices:
|
|
1860
|
+
enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
|
|
1861
|
+
enriched_Xy[EVAL_SET_INDEX] == eval_set_index
|
|
1862
|
+
].copy()
|
|
1863
|
+
enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
|
|
1864
|
+
|
|
1835
1865
|
x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
|
|
1836
1866
|
X_sampled = enriched_Xy[x_columns].copy()
|
|
1837
1867
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
@@ -1855,17 +1885,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1855
1885
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1856
1886
|
|
|
1857
1887
|
datasets_hash = hash_input(self.X, self.y, self.eval_set)
|
|
1858
|
-
self.
|
|
1888
|
+
return self.__cache_and_return_results(
|
|
1889
|
+
datasets_hash,
|
|
1859
1890
|
X_sampled,
|
|
1860
1891
|
y_sampled,
|
|
1861
1892
|
enriched_X,
|
|
1862
1893
|
eval_set_sampled_dict,
|
|
1863
|
-
search_keys,
|
|
1864
1894
|
self.fit_columns_renaming,
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
return self.__mk_sampled_data_tuple(
|
|
1868
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1895
|
+
search_keys,
|
|
1869
1896
|
)
|
|
1870
1897
|
|
|
1871
1898
|
def __sample_imbalanced(
|
|
@@ -1873,169 +1900,162 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1873
1900
|
validated_X: pd.DataFrame,
|
|
1874
1901
|
validated_y: pd.Series,
|
|
1875
1902
|
eval_set: Optional[List[tuple]],
|
|
1876
|
-
is_demo_dataset: bool,
|
|
1877
1903
|
exclude_features_sources: Optional[List[str]],
|
|
1878
1904
|
trace_id: str,
|
|
1879
1905
|
progress_bar: Optional[ProgressBar],
|
|
1880
1906
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1881
1907
|
) -> _SampledDataForMetrics:
|
|
1882
|
-
|
|
1883
|
-
if eval_set is not None:
|
|
1884
|
-
self.logger.info("Transform with eval_set")
|
|
1885
|
-
# concatenate X and eval_set with eval_set_index
|
|
1886
|
-
df = validated_X.copy()
|
|
1887
|
-
df[TARGET] = validated_y
|
|
1888
|
-
df[EVAL_SET_INDEX] = 0
|
|
1889
|
-
for idx, eval_pair in enumerate(eval_set):
|
|
1890
|
-
eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
|
|
1891
|
-
eval_df_with_index = eval_x.copy()
|
|
1892
|
-
eval_df_with_index[TARGET] = eval_y
|
|
1893
|
-
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1894
|
-
df = pd.concat([df, eval_df_with_index])
|
|
1895
|
-
|
|
1896
|
-
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1897
|
-
|
|
1898
|
-
# downsample if need to eval_set threshold
|
|
1899
|
-
num_samples = _num_samples(df)
|
|
1900
|
-
force_downsampling = (
|
|
1901
|
-
not self.disable_force_downsampling
|
|
1902
|
-
and self.columns_for_online_api is not None
|
|
1903
|
-
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1904
|
-
)
|
|
1905
|
-
# TODO: check that system_record_id was added before this step
|
|
1906
|
-
if force_downsampling:
|
|
1907
|
-
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1908
|
-
df = balance_undersample_forced(
|
|
1909
|
-
df=df,
|
|
1910
|
-
target_column=TARGET,
|
|
1911
|
-
id_columns=self.id_columns,
|
|
1912
|
-
date_column=self._get_date_column(self.search_keys),
|
|
1913
|
-
task_type=self.model_task_type,
|
|
1914
|
-
cv_type=self.cv,
|
|
1915
|
-
random_state=self.random_state,
|
|
1916
|
-
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1917
|
-
logger=self.logger,
|
|
1918
|
-
bundle=self.bundle,
|
|
1919
|
-
warning_callback=self.__log_warning,
|
|
1920
|
-
)
|
|
1921
|
-
elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
|
|
1922
|
-
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1923
|
-
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1908
|
+
has_eval_set = eval_set is not None
|
|
1924
1909
|
|
|
1925
|
-
|
|
1910
|
+
self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
|
|
1926
1911
|
|
|
1927
|
-
|
|
1928
|
-
|
|
1912
|
+
# Prepare
|
|
1913
|
+
df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
|
|
1914
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1915
|
+
df = self.__downsample_for_metrics(df)
|
|
1929
1916
|
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1917
|
+
# Transform
|
|
1918
|
+
enriched_df, columns_renaming, generated_features, search_keys = self.__inner_transform(
|
|
1919
|
+
trace_id,
|
|
1920
|
+
X=df.drop(columns=[TARGET]),
|
|
1921
|
+
y=df[TARGET],
|
|
1922
|
+
exclude_features_sources=exclude_features_sources,
|
|
1923
|
+
silent_mode=True,
|
|
1924
|
+
metrics_calculation=True,
|
|
1925
|
+
progress_bar=progress_bar,
|
|
1926
|
+
progress_callback=progress_callback,
|
|
1927
|
+
add_fit_system_record_id=True,
|
|
1928
|
+
)
|
|
1929
|
+
if enriched_df is None:
|
|
1930
|
+
return None
|
|
1943
1931
|
|
|
1944
|
-
|
|
1932
|
+
x_columns = [
|
|
1933
|
+
c
|
|
1934
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1935
|
+
if c in enriched_df.columns
|
|
1936
|
+
]
|
|
1945
1937
|
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
]
|
|
1938
|
+
X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
|
|
1939
|
+
eval_set_sampled_dict = self.__extract_eval_data(
|
|
1940
|
+
enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
|
|
1941
|
+
)
|
|
1951
1942
|
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1943
|
+
# Cache and return results
|
|
1944
|
+
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
1945
|
+
return self.__cache_and_return_results(
|
|
1946
|
+
datasets_hash,
|
|
1947
|
+
X_sampled,
|
|
1948
|
+
y_sampled,
|
|
1949
|
+
enriched_X,
|
|
1950
|
+
eval_set_sampled_dict,
|
|
1951
|
+
columns_renaming,
|
|
1952
|
+
search_keys,
|
|
1953
|
+
)
|
|
1957
1954
|
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1955
|
+
def __combine_train_and_eval_sets(
|
|
1956
|
+
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
|
|
1957
|
+
) -> pd.DataFrame:
|
|
1958
|
+
df = validated_X.copy()
|
|
1959
|
+
df[TARGET] = validated_y
|
|
1960
|
+
if eval_set is None:
|
|
1961
|
+
return df
|
|
1962
|
+
|
|
1963
|
+
df[EVAL_SET_INDEX] = 0
|
|
1964
|
+
|
|
1965
|
+
for idx, eval_pair in enumerate(eval_set):
|
|
1966
|
+
eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
|
|
1967
|
+
eval_df_with_index = eval_x.copy()
|
|
1968
|
+
eval_df_with_index[TARGET] = eval_y
|
|
1969
|
+
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1970
|
+
df = pd.concat([df, eval_df_with_index])
|
|
1967
1971
|
|
|
1968
|
-
|
|
1972
|
+
return df
|
|
1969
1973
|
|
|
1970
|
-
|
|
1974
|
+
def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
1975
|
+
num_samples = _num_samples(df)
|
|
1976
|
+
force_downsampling = (
|
|
1977
|
+
not self.disable_force_downsampling
|
|
1978
|
+
and self.columns_for_online_api is not None
|
|
1979
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1980
|
+
)
|
|
1971
1981
|
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1982
|
+
if force_downsampling:
|
|
1983
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1984
|
+
return balance_undersample_forced(
|
|
1985
|
+
df=df,
|
|
1986
|
+
target_column=TARGET,
|
|
1987
|
+
id_columns=self.id_columns,
|
|
1988
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1989
|
+
task_type=self.model_task_type,
|
|
1990
|
+
cv_type=self.cv,
|
|
1991
|
+
random_state=self.random_state,
|
|
1992
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1993
|
+
logger=self.logger,
|
|
1994
|
+
bundle=self.bundle,
|
|
1995
|
+
warning_callback=self.__log_warning,
|
|
1977
1996
|
)
|
|
1997
|
+
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1998
|
+
if EVAL_SET_INDEX in df.columns:
|
|
1999
|
+
threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
2000
|
+
sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
2001
|
+
else:
|
|
2002
|
+
threshold = Dataset.FIT_SAMPLE_THRESHOLD
|
|
2003
|
+
sample_size = Dataset.FIT_SAMPLE_ROWS
|
|
1978
2004
|
|
|
1979
|
-
if
|
|
1980
|
-
self.logger.info(f"
|
|
1981
|
-
df =
|
|
1982
|
-
df=df,
|
|
1983
|
-
target_column=TARGET,
|
|
1984
|
-
id_columns=self.id_columns,
|
|
1985
|
-
date_column=self._get_date_column(self.search_keys),
|
|
1986
|
-
task_type=self.model_task_type,
|
|
1987
|
-
cv_type=self.cv,
|
|
1988
|
-
random_state=self.random_state,
|
|
1989
|
-
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1990
|
-
logger=self.logger,
|
|
1991
|
-
bundle=self.bundle,
|
|
1992
|
-
warning_callback=self.__log_warning,
|
|
1993
|
-
)
|
|
1994
|
-
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1995
|
-
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
1996
|
-
df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
|
|
2005
|
+
if num_samples > threshold:
|
|
2006
|
+
self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
|
|
2007
|
+
return df.sample(n=sample_size, random_state=self.random_state)
|
|
1997
2008
|
|
|
1998
|
-
|
|
1999
|
-
df = df.rename(columns={TARGET: tmp_target_name})
|
|
2009
|
+
return df
|
|
2000
2010
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
if enriched_Xy is None:
|
|
2013
|
-
return None
|
|
2011
|
+
def __extract_train_data(
|
|
2012
|
+
self, enriched_df: pd.DataFrame, x_columns: List[str]
|
|
2013
|
+
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
|
|
2014
|
+
if EVAL_SET_INDEX in enriched_df.columns:
|
|
2015
|
+
enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
|
|
2016
|
+
else:
|
|
2017
|
+
enriched_Xy = enriched_df
|
|
2018
|
+
X_sampled = enriched_Xy[x_columns].copy()
|
|
2019
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
2020
|
+
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
2021
|
+
return X_sampled, y_sampled, enriched_X
|
|
2014
2022
|
|
|
2015
|
-
|
|
2023
|
+
def __extract_eval_data(
|
|
2024
|
+
self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
|
|
2025
|
+
) -> Dict[int, Tuple]:
|
|
2026
|
+
eval_set_sampled_dict = {}
|
|
2016
2027
|
|
|
2017
|
-
|
|
2018
|
-
|
|
2019
|
-
|
|
2020
|
-
|
|
2021
|
-
]
|
|
2028
|
+
for idx in range(eval_set_len):
|
|
2029
|
+
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
2030
|
+
eval_x_sampled = enriched_eval_xy[x_columns].copy()
|
|
2031
|
+
eval_y_sampled = enriched_eval_xy[TARGET].copy()
|
|
2032
|
+
enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
|
|
2033
|
+
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
2022
2034
|
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2035
|
+
return eval_set_sampled_dict
|
|
2036
|
+
|
|
2037
|
+
def __cache_and_return_results(
|
|
2038
|
+
self,
|
|
2039
|
+
datasets_hash: str,
|
|
2040
|
+
X_sampled: pd.DataFrame,
|
|
2041
|
+
y_sampled: pd.Series,
|
|
2042
|
+
enriched_X: pd.DataFrame,
|
|
2043
|
+
eval_set_sampled_dict: Dict[int, Tuple],
|
|
2044
|
+
columns_renaming: Dict[str, str],
|
|
2045
|
+
search_keys: Dict[str, SearchKey],
|
|
2046
|
+
) -> _SampledDataForMetrics:
|
|
2026
2047
|
|
|
2027
|
-
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
2028
2048
|
self.__cached_sampled_datasets[datasets_hash] = (
|
|
2029
2049
|
X_sampled,
|
|
2030
2050
|
y_sampled,
|
|
2031
2051
|
enriched_X,
|
|
2032
2052
|
eval_set_sampled_dict,
|
|
2033
|
-
|
|
2053
|
+
search_keys,
|
|
2034
2054
|
columns_renaming,
|
|
2035
2055
|
)
|
|
2036
2056
|
|
|
2037
2057
|
return self.__mk_sampled_data_tuple(
|
|
2038
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict,
|
|
2058
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
2039
2059
|
)
|
|
2040
2060
|
|
|
2041
2061
|
def __mk_sampled_data_tuple(
|
|
@@ -2047,7 +2067,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2047
2067
|
search_keys: Dict,
|
|
2048
2068
|
columns_renaming: Dict[str, str],
|
|
2049
2069
|
):
|
|
2050
|
-
search_keys = {
|
|
2070
|
+
search_keys = {
|
|
2071
|
+
columns_renaming.get(k, k): v
|
|
2072
|
+
for k, v in search_keys.items()
|
|
2073
|
+
if columns_renaming.get(k, k) in X_sampled.columns.to_list()
|
|
2074
|
+
}
|
|
2051
2075
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
2052
2076
|
X_sampled=X_sampled,
|
|
2053
2077
|
y_sampled=y_sampled,
|
|
@@ -2161,6 +2185,7 @@ if response.status_code == 200:
|
|
|
2161
2185
|
trace_id: str,
|
|
2162
2186
|
X: pd.DataFrame,
|
|
2163
2187
|
*,
|
|
2188
|
+
y: Optional[pd.Series] = None,
|
|
2164
2189
|
exclude_features_sources: Optional[List[str]] = None,
|
|
2165
2190
|
importance_threshold: Optional[float] = None,
|
|
2166
2191
|
max_features: Optional[int] = None,
|
|
@@ -2169,8 +2194,7 @@ if response.status_code == 200:
|
|
|
2169
2194
|
progress_bar: Optional[ProgressBar] = None,
|
|
2170
2195
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2171
2196
|
add_fit_system_record_id: bool = False,
|
|
2172
|
-
|
|
2173
|
-
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2197
|
+
) -> Tuple[pd.DataFrame, Dict[str, str], List[str], Dict[str, SearchKey]]:
|
|
2174
2198
|
if self._search_task is None:
|
|
2175
2199
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2176
2200
|
|
|
@@ -2179,20 +2203,28 @@ if response.status_code == 200:
|
|
|
2179
2203
|
self.logger.info("Start transform")
|
|
2180
2204
|
|
|
2181
2205
|
validated_X = self._validate_X(X, is_transform=True)
|
|
2206
|
+
if y is not None:
|
|
2207
|
+
validated_y = self._validate_y(validated_X, y)
|
|
2208
|
+
df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
|
|
2209
|
+
else:
|
|
2210
|
+
validated_y = None
|
|
2211
|
+
df = validated_X
|
|
2182
2212
|
|
|
2183
|
-
|
|
2213
|
+
validated_Xy = df.copy()
|
|
2214
|
+
|
|
2215
|
+
self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
|
|
2184
2216
|
|
|
2185
2217
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
2186
2218
|
|
|
2187
2219
|
if len(self.feature_names_) == 0:
|
|
2188
2220
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
2189
|
-
return X, {c: c for c in X.columns}, []
|
|
2221
|
+
return X, {c: c for c in X.columns}, [], {}
|
|
2190
2222
|
|
|
2191
2223
|
if self._has_paid_features(exclude_features_sources):
|
|
2192
2224
|
msg = self.bundle.get("transform_with_paid_features")
|
|
2193
2225
|
self.logger.warning(msg)
|
|
2194
2226
|
self.__display_support_link(msg)
|
|
2195
|
-
return None, {c: c for c in X.columns}, []
|
|
2227
|
+
return None, {c: c for c in X.columns}, [], {}
|
|
2196
2228
|
|
|
2197
2229
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2198
2230
|
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
|
@@ -2215,7 +2247,7 @@ if response.status_code == 200:
|
|
|
2215
2247
|
self.logger.warning(msg)
|
|
2216
2248
|
print(msg)
|
|
2217
2249
|
show_request_quote_button()
|
|
2218
|
-
return None, {c: c for c in X.columns}, []
|
|
2250
|
+
return None, {c: c for c in X.columns}, [], {}
|
|
2219
2251
|
else:
|
|
2220
2252
|
msg = self.bundle.get("transform_usage_info").format(
|
|
2221
2253
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -2223,29 +2255,27 @@ if response.status_code == 200:
|
|
|
2223
2255
|
self.logger.info(msg)
|
|
2224
2256
|
print(msg)
|
|
2225
2257
|
|
|
2226
|
-
is_demo_dataset = hash_input(
|
|
2258
|
+
is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
|
|
2227
2259
|
|
|
2228
2260
|
columns_to_drop = [
|
|
2229
|
-
c for c in
|
|
2261
|
+
c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2230
2262
|
]
|
|
2231
2263
|
if len(columns_to_drop) > 0:
|
|
2232
2264
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2233
2265
|
self.logger.warning(msg)
|
|
2234
2266
|
print(msg)
|
|
2235
|
-
|
|
2267
|
+
df = df.drop(columns=columns_to_drop)
|
|
2236
2268
|
|
|
2237
2269
|
search_keys = self.search_keys.copy()
|
|
2238
2270
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2239
|
-
|
|
2271
|
+
search_keys.update(
|
|
2240
2272
|
{col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
|
|
2241
2273
|
)
|
|
2242
2274
|
|
|
2243
2275
|
search_keys = self.__prepare_search_keys(
|
|
2244
|
-
|
|
2276
|
+
df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
|
|
2245
2277
|
)
|
|
2246
2278
|
|
|
2247
|
-
df = validated_X.copy()
|
|
2248
|
-
|
|
2249
2279
|
df = self.__handle_index_search_keys(df, search_keys)
|
|
2250
2280
|
|
|
2251
2281
|
if DEFAULT_INDEX in df.columns:
|
|
@@ -2253,7 +2283,7 @@ if response.status_code == 200:
|
|
|
2253
2283
|
self.logger.info(msg)
|
|
2254
2284
|
print(msg)
|
|
2255
2285
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2256
|
-
|
|
2286
|
+
validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2257
2287
|
|
|
2258
2288
|
df = self.__add_country_code(df, search_keys)
|
|
2259
2289
|
|
|
@@ -2284,8 +2314,11 @@ if response.status_code == 200:
|
|
|
2284
2314
|
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2285
2315
|
if len(features_for_transform) > 0:
|
|
2286
2316
|
missing_features_for_transform = [
|
|
2287
|
-
columns_renaming.get(f) for f in features_for_transform if f not in df.columns
|
|
2317
|
+
columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
|
|
2288
2318
|
]
|
|
2319
|
+
if TARGET in missing_features_for_transform:
|
|
2320
|
+
raise ValidationError(self.bundle.get("missing_target_for_transform"))
|
|
2321
|
+
|
|
2289
2322
|
if len(missing_features_for_transform) > 0:
|
|
2290
2323
|
raise ValidationError(
|
|
2291
2324
|
self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
|
|
@@ -2341,11 +2374,10 @@ if response.status_code == 200:
|
|
|
2341
2374
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2342
2375
|
df = converter.convert(df)
|
|
2343
2376
|
|
|
2344
|
-
|
|
2377
|
+
meaning_types = {}
|
|
2378
|
+
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
|
2379
|
+
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
|
2345
2380
|
|
|
2346
|
-
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2347
|
-
for col in features_for_transform:
|
|
2348
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2349
2381
|
features_not_to_pass = [
|
|
2350
2382
|
c
|
|
2351
2383
|
for c in df.columns
|
|
@@ -2354,13 +2386,12 @@ if response.status_code == 200:
|
|
|
2354
2386
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2355
2387
|
]
|
|
2356
2388
|
|
|
2357
|
-
if add_fit_system_record_id
|
|
2358
|
-
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
2389
|
+
if add_fit_system_record_id:
|
|
2359
2390
|
df = self.__add_fit_system_record_id(
|
|
2360
2391
|
df,
|
|
2361
2392
|
search_keys,
|
|
2362
2393
|
SYSTEM_RECORD_ID,
|
|
2363
|
-
|
|
2394
|
+
TARGET,
|
|
2364
2395
|
columns_renaming,
|
|
2365
2396
|
silent=True,
|
|
2366
2397
|
)
|
|
@@ -2485,25 +2516,29 @@ if response.status_code == 200:
|
|
|
2485
2516
|
if progress_callback is not None:
|
|
2486
2517
|
progress_callback(progress)
|
|
2487
2518
|
|
|
2488
|
-
def enrich():
|
|
2489
|
-
res, _ = self.__enrich(
|
|
2490
|
-
df_with_original_index,
|
|
2491
|
-
validation_task.get_all_validation_raw_features(trace_id, metrics_calculation),
|
|
2492
|
-
validated_X,
|
|
2493
|
-
is_transform=True,
|
|
2494
|
-
)
|
|
2495
|
-
return res
|
|
2496
|
-
|
|
2497
2519
|
if not silent_mode:
|
|
2498
2520
|
print(self.bundle.get("transform_start"))
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2521
|
+
|
|
2522
|
+
# Prepare input DataFrame for __enrich by concatenating generated ids and client features
|
|
2523
|
+
combined_df = pd.concat(
|
|
2524
|
+
[
|
|
2525
|
+
validated_Xy.reset_index(drop=True),
|
|
2526
|
+
df_with_original_index.reset_index(drop=True),
|
|
2527
|
+
],
|
|
2528
|
+
axis=1,
|
|
2529
|
+
).set_index(validated_Xy.index)
|
|
2530
|
+
|
|
2531
|
+
result_features = validation_task.get_all_validation_raw_features(trace_id, metrics_calculation)
|
|
2532
|
+
|
|
2533
|
+
result = self.__enrich(
|
|
2534
|
+
combined_df,
|
|
2535
|
+
result_features,
|
|
2536
|
+
how="left",
|
|
2537
|
+
)
|
|
2503
2538
|
|
|
2504
2539
|
selecting_columns = [
|
|
2505
2540
|
c
|
|
2506
|
-
for c in itertools.chain(
|
|
2541
|
+
for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
|
|
2507
2542
|
if c not in self.dropped_client_feature_names_
|
|
2508
2543
|
]
|
|
2509
2544
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
@@ -2515,7 +2550,7 @@ if response.status_code == 200:
|
|
|
2515
2550
|
|
|
2516
2551
|
selecting_columns = list(set(selecting_columns))
|
|
2517
2552
|
# sorting: first columns from X, then generated features, then enriched features
|
|
2518
|
-
sorted_selecting_columns = [c for c in
|
|
2553
|
+
sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
|
|
2519
2554
|
for c in generated_features:
|
|
2520
2555
|
if c in selecting_columns and c not in sorted_selecting_columns:
|
|
2521
2556
|
sorted_selecting_columns.append(c)
|
|
@@ -2533,7 +2568,7 @@ if response.status_code == 200:
|
|
|
2533
2568
|
if add_fit_system_record_id:
|
|
2534
2569
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2535
2570
|
|
|
2536
|
-
return result, columns_renaming, generated_features
|
|
2571
|
+
return result, columns_renaming, generated_features, search_keys
|
|
2537
2572
|
|
|
2538
2573
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2539
2574
|
features_info = self._internal_features_info
|
|
@@ -2643,6 +2678,7 @@ if response.status_code == 200:
|
|
|
2643
2678
|
importance_threshold: Optional[float],
|
|
2644
2679
|
max_features: Optional[int],
|
|
2645
2680
|
remove_outliers_calc_metrics: Optional[bool],
|
|
2681
|
+
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
|
2646
2682
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2647
2683
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
|
2648
2684
|
):
|
|
@@ -2948,6 +2984,7 @@ if response.status_code == 200:
|
|
|
2948
2984
|
runtime_parameters=runtime_parameters,
|
|
2949
2985
|
exclude_features_sources=exclude_features_sources,
|
|
2950
2986
|
force_downsampling=force_downsampling,
|
|
2987
|
+
auto_fe_parameters=auto_fe_parameters,
|
|
2951
2988
|
)
|
|
2952
2989
|
|
|
2953
2990
|
if search_id_callback is not None:
|
|
@@ -3712,23 +3749,19 @@ if response.status_code == 200:
|
|
|
3712
3749
|
|
|
3713
3750
|
def __enrich(
|
|
3714
3751
|
self,
|
|
3715
|
-
|
|
3752
|
+
input_df: pd.DataFrame,
|
|
3716
3753
|
result_features: Optional[pd.DataFrame],
|
|
3717
|
-
|
|
3718
|
-
is_transform=False,
|
|
3719
|
-
rows_to_drop: Optional[pd.DataFrame] = None,
|
|
3754
|
+
how: str = "inner",
|
|
3720
3755
|
drop_system_record_id=True,
|
|
3721
|
-
) ->
|
|
3756
|
+
) -> pd.DataFrame:
|
|
3722
3757
|
if result_features is None:
|
|
3723
3758
|
self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
|
|
3724
3759
|
raise RuntimeError(self.bundle.get("features_wasnt_returned"))
|
|
3725
|
-
result_features = (
|
|
3726
|
-
result_features.drop(columns=EVAL_SET_INDEX)
|
|
3727
|
-
if EVAL_SET_INDEX in result_features.columns
|
|
3728
|
-
else result_features
|
|
3729
|
-
)
|
|
3730
3760
|
|
|
3731
|
-
|
|
3761
|
+
if EVAL_SET_INDEX in result_features.columns:
|
|
3762
|
+
result_features = result_features.drop(columns=EVAL_SET_INDEX)
|
|
3763
|
+
|
|
3764
|
+
comparing_columns = input_df.columns
|
|
3732
3765
|
dup_features = [
|
|
3733
3766
|
c
|
|
3734
3767
|
for c in comparing_columns
|
|
@@ -3738,63 +3771,39 @@ if response.status_code == 200:
|
|
|
3738
3771
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3739
3772
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
3740
3773
|
|
|
3741
|
-
# index
|
|
3742
|
-
original_index_name =
|
|
3743
|
-
|
|
3774
|
+
# Handle index and column renaming
|
|
3775
|
+
original_index_name = input_df.index.name
|
|
3776
|
+
renamed_column = None
|
|
3777
|
+
|
|
3778
|
+
# Handle column rename if it conflicts with index name
|
|
3779
|
+
if original_index_name in input_df.columns:
|
|
3780
|
+
renamed_column = f"{original_index_name}_renamed"
|
|
3781
|
+
input_df = input_df.rename(columns={original_index_name: renamed_column})
|
|
3782
|
+
|
|
3783
|
+
# Reset index for the merge operation
|
|
3784
|
+
input_df = input_df.reset_index()
|
|
3785
|
+
|
|
3744
3786
|
# TODO drop system_record_id before merge
|
|
3787
|
+
# Merge with result features
|
|
3745
3788
|
result_features = pd.merge(
|
|
3746
|
-
|
|
3789
|
+
input_df,
|
|
3747
3790
|
result_features,
|
|
3748
3791
|
on=ENTITY_SYSTEM_RECORD_ID,
|
|
3749
|
-
how=
|
|
3792
|
+
how=how,
|
|
3750
3793
|
)
|
|
3794
|
+
|
|
3795
|
+
# Restore the index
|
|
3751
3796
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
3752
3797
|
result_features.index.name = original_index_name
|
|
3753
3798
|
|
|
3754
|
-
|
|
3755
|
-
|
|
3756
|
-
result_features = result_features
|
|
3757
|
-
~result_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
|
|
3758
|
-
]
|
|
3759
|
-
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3760
|
-
|
|
3761
|
-
result_eval_sets = {}
|
|
3762
|
-
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3763
|
-
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3764
|
-
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
3765
|
-
if 0 in eval_set_indices:
|
|
3766
|
-
eval_set_indices.remove(0)
|
|
3767
|
-
for eval_set_index in eval_set_indices:
|
|
3768
|
-
result_eval_sets[eval_set_index] = result_features.loc[
|
|
3769
|
-
result_features[EVAL_SET_INDEX] == eval_set_index
|
|
3770
|
-
].copy()
|
|
3771
|
-
result_train_features = result_train_features.drop(columns=EVAL_SET_INDEX)
|
|
3772
|
-
else:
|
|
3773
|
-
result_train_features = result_features
|
|
3774
|
-
|
|
3775
|
-
if is_transform:
|
|
3776
|
-
index_name = X.index.name
|
|
3777
|
-
renamed_column = None
|
|
3778
|
-
if index_name in X.columns:
|
|
3779
|
-
renamed_column = f"{index_name}_renamed"
|
|
3780
|
-
X = X.rename(columns={index_name: renamed_column})
|
|
3781
|
-
result_train = pd.concat([X.reset_index(), result_train_features.reset_index(drop=True)], axis=1).set_index(
|
|
3782
|
-
index_name or DEFAULT_INDEX
|
|
3783
|
-
)
|
|
3784
|
-
result_train.index.name = index_name
|
|
3785
|
-
if renamed_column is not None:
|
|
3786
|
-
result_train = result_train.rename(columns={renamed_column: index_name})
|
|
3787
|
-
else:
|
|
3788
|
-
result_train = result_train_features
|
|
3799
|
+
# Restore renamed column if needed
|
|
3800
|
+
if renamed_column is not None:
|
|
3801
|
+
result_features = result_features.rename(columns={renamed_column: original_index_name})
|
|
3789
3802
|
|
|
3790
3803
|
if drop_system_record_id:
|
|
3791
|
-
|
|
3792
|
-
for eval_set_index in result_eval_sets.keys():
|
|
3793
|
-
result_eval_sets[eval_set_index] = result_eval_sets[eval_set_index].drop(
|
|
3794
|
-
columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore"
|
|
3795
|
-
)
|
|
3804
|
+
result_features = result_features.drop(columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore")
|
|
3796
3805
|
|
|
3797
|
-
return
|
|
3806
|
+
return result_features
|
|
3798
3807
|
|
|
3799
3808
|
def __prepare_feature_importances(
|
|
3800
3809
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
|
@@ -4075,10 +4084,7 @@ if response.status_code == 200:
|
|
|
4075
4084
|
)
|
|
4076
4085
|
|
|
4077
4086
|
if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
|
|
4078
|
-
|
|
4079
|
-
msg = self.bundle.get("only_custom_keys")
|
|
4080
|
-
else:
|
|
4081
|
-
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4087
|
+
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4082
4088
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
4083
4089
|
raise ValidationError(msg)
|
|
4084
4090
|
|