upgini 1.2.91a3906.dev1__py3-none-any.whl → 1.2.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/unary.py +0 -8
- upgini/dataset.py +58 -109
- upgini/features_enricher.py +225 -187
- upgini/metadata.py +3 -0
- upgini/metrics.py +12 -11
- upgini/resource_bundle/strings.properties +2 -0
- upgini/utils/feature_info.py +2 -2
- upgini/utils/sample_utils.py +416 -0
- upgini/utils/target_utils.py +3 -199
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.92.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.92.dist-info}/RECORD +14 -13
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.92.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.92.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -5,7 +5,6 @@ import hashlib
 import itertools
 import json
 import logging
-import numbers
 import os
 import sys
 import tempfile
@@ -30,6 +29,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+from sklearn.preprocessing import OrdinalEncoder

 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -118,9 +118,9 @@ except Exception:
     CustomFallbackProgressBar as ProgressBar,
 )

+from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
-    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
@@ -242,6 +242,7 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
+        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -286,6 +287,7 @@ class FeaturesEnricher(TransformerMixin):

         self.search_keys = search_keys or {}
         self.id_columns = id_columns
+        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)

@@ -299,6 +301,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
+        self.external_source_feature_names = []
         self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
@@ -359,10 +362,8 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-
-
-        Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
-        Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
+
+        self.sample_config = self._get_sample_config(sample_config)

         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
@@ -375,6 +376,16 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None

+    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
+
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
+            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
+
+        return sample_config
+
     def _get_api_key(self):
         return self._api_key

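Note: the new `sample_config` constructor argument replaces the old pattern of mutating `Dataset.FIT_SAMPLE_*` class attributes. A minimal usage sketch; only the `force_sample_size`, `fit_sample_rows` and `fit_sample_threshold` fields are confirmed by this diff, the search-key setup is illustrative:

```python
# Sketch: pass explicit sampling limits instead of relying on defaults
from upgini import FeaturesEnricher, SearchKey
from upgini.utils.sample_utils import SampleConfig

config = SampleConfig(force_sample_size=7_000)  # field name as it appears in this diff
enricher = FeaturesEnricher(
    search_keys={"phone_num": SearchKey.PHONE},
    sample_config=config,  # new in 1.2.92
)
```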
@@ -928,16 +939,15 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

-        validated_X = self.
-
-        validated_eval_set = (
-            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
-            if effective_eval_set is not None
-            else None
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+            effective_X, effective_y, effective_eval_set
         )

         if self.X is None:
             self.X = X
+            self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
+                X[self.id_columns or []]
+            )
         if self.y is None:
             self.y = y
         if self.eval_set is None:
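For context, `OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)` encodes categories seen at fit time as non-negative integers and anything unseen as -1; that sentinel is what the new `_encode_id_columns` helper later uses to detect and report unknown ids. A standalone illustration:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train_ids = pd.DataFrame({"store_id": ["a", "b", "c"]})
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(train_ids)

new_ids = pd.DataFrame({"store_id": ["b", "zzz"]})
print(enc.transform(new_ids))  # [[1.], [-1.]] -- "zzz" was never seen at fit time
```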
@@ -971,6 +981,19 @@ class FeaturesEnricher(TransformerMixin):
         client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
             estimator, validated_X, self.search_keys
         )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c
+                    for c in cat_features_from_backend
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c
+                    for c in client_cat_features
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
         for cat_feature in cat_features_from_backend:
             original_cat_feature = self.fit_columns_renaming.get(cat_feature)
             if original_cat_feature in self.search_keys:
@@ -1245,7 +1268,8 @@ class FeaturesEnricher(TransformerMixin):
             metrics.append(eval_metrics)

         if updating_shaps is not None:
-            self.
+            decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+            self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)

         metrics_df = pd.DataFrame(metrics)
         mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1499,16 +1523,10 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-        validated_X = self._validate_X(X)
-        validated_y = self._validate_y(validated_X, y)
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_eval_set = (
-            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
-            if checked_eval_set
-            else None
-        )
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

-        sampled_data = self.
+        sampled_data = self._get_enriched_for_metrics(
             trace_id,
             validated_X,
             validated_y,
@@ -1582,7 +1600,11 @@ class FeaturesEnricher(TransformerMixin):
         fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")

         # Detect and drop constant columns
-        constant_columns =
+        constant_columns = [
+            c
+            for c in FeaturesValidator.find_constant_features(fitting_X)
+            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
+        ]
         if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
@@ -1625,6 +1647,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
+        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1636,6 +1659,7 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1663,6 +1687,12 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )

+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
+
+            if len(unknown_dict) > 0:
+                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
+
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
@@ -1684,7 +1714,7 @@ class FeaturesEnricher(TransformerMixin):
         )

     @dataclass
-    class
+    class _EnrichedDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
         enriched_X: pd.DataFrame
@@ -1692,7 +1722,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]

-    def
+    def _get_enriched_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1704,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1712,7 +1742,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.
+            return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
@@ -1721,14 +1751,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.
+            return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.
+            return self.__get_enriched_from_transform(
                 validated_X,
                 validated_y,
                 eval_set,
@@ -1740,7 +1770,7 @@ class FeaturesEnricher(TransformerMixin):

     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
@@ -1757,9 +1787,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )

-    def
+    def __get_enriched_as_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}

         df = validated_X.copy()
@@ -1801,24 +1831,13 @@ class FeaturesEnricher(TransformerMixin):
         normalizer = Normalizer(self.bundle, self.logger)
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
-        # columns_renaming = {c: c for c in df.columns}

         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-        num_samples = _num_samples(df)
-        sample_threshold, sample_rows = (
-            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
-            if eval_set is not None
-            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
-        )
-
         df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
-        if num_samples > sample_threshold:
-            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
-            df = df.sample(n=sample_rows, random_state=self.random_state)
+        df = self.__downsample_for_metrics(df)

         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
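The sort-then-sample sequence above is what makes metric subsampling idempotent: given a stable row order and a fixed `random_state`, `DataFrame.sample` returns the same rows on every call. A self-contained demonstration:

```python
import pandas as pd

df = pd.DataFrame({"system_record_id": [3, 1, 2], "x": [30, 10, 20]})
df = df.sort_values(by="system_record_id")

# Stable order + fixed seed => identical subsample on every run
assert df.sample(n=2, random_state=42).equals(df.sample(n=2, random_state=42))
```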
@@ -1847,12 +1866,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )

-    def
+    def __get_enriched_from_fit(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys

@@ -1951,7 +1970,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )

-    def
+    def __get_enriched_from_transform(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
@@ -1960,7 +1979,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         has_eval_set = eval_set is not None

         self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2017,17 +2036,18 @@ class FeaturesEnricher(TransformerMixin):
         )

     def __combine_train_and_eval_sets(
-        self,
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
     ) -> pd.DataFrame:
-        df =
-
-
+        df = X.copy()
+        if y is not None:
+            df[TARGET] = y
+        if not eval_set:
             return df

         df[EVAL_SET_INDEX] = 0

         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y =
+            eval_x, eval_y = eval_pair
             eval_df_with_index = eval_x.copy()
             eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
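`__combine_train_and_eval_sets` now serves both fit and transform: it stacks the train frame and every eval pair into one frame, tagging rows with `EVAL_SET_INDEX` (0 for train, i + 1 for the i-th eval pair). A rough pandas equivalent of the frame it builds; the column names here are illustrative stand-ins for the `TARGET` and `EVAL_SET_INDEX` constants:

```python
import pandas as pd

X, y = pd.DataFrame({"f": [1, 2]}), pd.Series([0, 1])
eval_set = [(pd.DataFrame({"f": [3]}), pd.Series([1]))]

df = X.copy()
df["target"] = y
df["eval_set_index"] = 0  # train rows
for idx, (eval_x, eval_y) in enumerate(eval_set):
    eval_df = eval_x.copy()
    eval_df["target"] = eval_y.values
    eval_df["eval_set_index"] = idx + 1  # i-th eval pair
    df = pd.concat([df, eval_df])
```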
@@ -2036,42 +2056,37 @@ class FeaturesEnricher(TransformerMixin):
         return df

     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        force_downsampling = self.__use_force_downsampling(df)
+
+        sample_columns = SampleColumns(
+            ids=self.id_columns,
+            date=self._get_date_column(self.search_keys),
+            target=TARGET,
+            eval_set_index=EVAL_SET_INDEX,
+        )
+
+        return sample(
+            df,
+            self.model_task_type,
+            self.cv,
+            self.sample_config,
+            sample_columns,
+            self.random_state,
+            force_downsampling=force_downsampling,
+            balance=False,
+            logger=self.logger,
+            bundle=self.bundle,
+            warning_callback=self.__log_warning,
+        )
+
+    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-
+        return (
             not self.disable_force_downsampling
             and self.columns_for_online_api is not None
             and num_samples > Dataset.FORCE_SAMPLE_SIZE
         )

-        if force_downsampling:
-            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-            return balance_undersample_forced(
-                df=df,
-                target_column=TARGET,
-                id_columns=self.id_columns,
-                date_column=self._get_date_column(self.search_keys),
-                task_type=self.model_task_type,
-                cv_type=self.cv,
-                random_state=self.random_state,
-                sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                logger=self.logger,
-                bundle=self.bundle,
-                warning_callback=self.__log_warning,
-            )
-        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-            if EVAL_SET_INDEX in df.columns:
-                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-            else:
-                threshold = Dataset.FIT_SAMPLE_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_ROWS
-
-            if num_samples > threshold:
-                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
-                return df.sample(n=sample_size, random_state=self.random_state)
-
-            return df
-
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
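All three sampling regimes (forced downsampling for online-API columns, fit with eval set, fit without) are now delegated to `sample()` in the new upgini/utils/sample_utils.py. A hedged sketch of a direct call, with the argument order copied from the call site above; the `ModelTaskType` import path, the `date=None` value, and treating `logger`/`bundle`/`warning_callback` as optional are assumptions:

```python
# Sketch mirroring __downsample_for_metrics above
import pandas as pd
from upgini.metadata import ModelTaskType
from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample

df = pd.DataFrame({"store_id": [1, 2], "target": [0, 1]})
cols = SampleColumns(ids=["store_id"], date=None, target="target", eval_set_index="eval_set_index")

sampled = sample(
    df,
    ModelTaskType.BINARY,
    None,  # cv, passed as self.cv at the call site
    SampleConfig(force_sample_size=7_000),
    cols,
    42,  # random_state
    force_downsampling=False,
    balance=False,
)
```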
@@ -2107,7 +2122,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) ->
+    ) -> _EnrichedDataForMetrics:

         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
@@ -2138,7 +2153,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher.
+        return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
             enriched_X=enriched_X,
@@ -2286,13 +2301,10 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")

-            validated_X = self.
-
-
-
-            else:
-                validated_y = None
-            df = validated_X
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                X, y, eval_set=None, is_transform=True
+            )
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)

             validated_Xy = df.copy()

@@ -2346,7 +2358,9 @@ if response.status_code == 200:

             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

-            columns_to_drop = [
+            columns_to_drop = [
+                c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
+            ]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
@@ -2550,6 +2564,7 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
+                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
@@ -2653,7 +2668,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features
+                if c not in self.zero_shap_client_features or c in (self.id_columns or [])
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2801,13 +2816,8 @@ if response.status_code == 200:
         self.fit_dropped_features = set()
         self.fit_generated_features = []

-        validated_X = self.
-
-        validated_eval_set = (
-            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
-            if eval_set is not None
-            else None
-        )
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
+
         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
             msg = self.bundle.get("demo_dataset_info")
@@ -2852,14 +2862,8 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )

-        df =
-
-        if validated_eval_set is not None and len(validated_eval_set) > 0:
-            df[EVAL_SET_INDEX] = 0
-            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
-                eval_df = pd.concat([eval_X, eval_y], axis=1)
-                eval_df[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df])
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
+        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])

         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -2970,47 +2974,8 @@ if response.status_code == 200:
         # TODO check maybe need to drop _time column from df_with_original_index

         df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-
-        email_column = self._get_email_column(self.fit_search_keys)
-        hem_column = self._get_hem_column(self.fit_search_keys)
-        if email_column:
-            converter = EmailSearchKeyConverter(
-                email_column,
-                hem_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-
-        ip_column = self._get_ip_column(self.fit_search_keys)
-        if ip_column:
-            converter = IpSearchKeyConverter(
-                ip_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-        phone_column = self._get_phone_column(self.fit_search_keys)
-        country_column = self._get_country_column(self.fit_search_keys)
-        if phone_column:
-            converter = PhoneSearchKeyConverter(phone_column, country_column)
-            df = converter.convert(df)
-
-        if country_column:
-            converter = CountrySearchKeyConverter(country_column)
-            df = converter.convert(df)
-
-        postal_code = self._get_postal_column(self.fit_search_keys)
-        if postal_code:
-            converter = PostalCodeSearchKeyConverter(postal_code)
-            df = converter.convert(df)
+        # Convert EMAIL to HEM etc after unnesting to do it only with one column
+        df = self.__convert_unnestable_keys(df, unnest_search_keys)

         non_feature_columns = [
             self.TARGET_NAME,
@@ -3061,11 +3026,7 @@ if response.status_code == 200:
         runtime_parameters = self._get_copy_of_runtime_parameters()

         # Force downsampling to 7000 for API features generation
-        force_downsampling = (
-            not self.disable_force_downsampling
-            and self.columns_for_online_api is not None
-            and len(df) > Dataset.FORCE_SAMPLE_SIZE
-        )
+        force_downsampling = self.__use_force_downsampling(df)
         if force_downsampling:
             runtime_parameters.properties["fast_fit"] = True

@@ -3085,6 +3046,7 @@ if response.status_code == 200:
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.__log_warning,
+            sample_config=self.sample_config,
         )
         dataset.columns_renaming = self.fit_columns_renaming

@@ -3240,6 +3202,49 @@ if response.status_code == 200:
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))

+    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
+
+        return df
+
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())

@@ -3282,6 +3287,57 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]

+    def _validate_train_eval(
+        self,
+        X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
+        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
+        is_transform: bool = False,
+    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
+        validated_X = self._validate_X(X, is_transform)
+        validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        return validated_X, validated_y, validated_eval_set
+
+    def _encode_id_columns(
+        self,
+        X: pd.DataFrame,
+        columns_renaming: Optional[Dict[str, str]] = None,
+    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+        columns_renaming = columns_renaming or {}
+        unknown_dict = {}
+
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            for i, c in enumerate(renamed_id_columns):
+                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                if len(unknown_values) > 0:
+                    unknown_dict[c] = unknown_values
+            X[renamed_id_columns] = encoded
+            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+        if len(unknown_dict) > 0:
+            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+
+        return X, unknown_dict
+
+    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+        columns_renaming = columns_renaming or {}
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            X[renamed_id_columns] = decoded
+
+        return X
+
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
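Taken together, `_encode_id_columns` and `_decode_id_columns` form a filterable roundtrip: ids are mapped to integers, rows whose ids encode to the -1 sentinel are dropped (and reported), and the surviving codes can be mapped back. A standalone equivalent without the column-renaming bookkeeping:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc.fit(pd.DataFrame({"store_id": ["a", "b"]}))

X = pd.DataFrame({"store_id": ["a", "zzz"], "f": [1.0, 2.0]})
encoded = enc.transform(X[["store_id"]])
unknown = X[encoded[:, 0] == -1]["store_id"].unique().tolist()  # ["zzz"]

X[["store_id"]] = encoded
X = X.loc[(X[["store_id"]] != -1).all(axis=1)]            # drop rows with unseen ids
X[["store_id"]] = enc.inverse_transform(X[["store_id"]])  # decode back to "a"
```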
@@ -3323,7 +3379,9 @@ if response.status_code == 200:

         return validated_X

-    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
+    def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
+        if y is None and not enforce_y:
+            return None
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
@@ -3370,6 +3428,11 @@ if response.status_code == 200:

         return validated_y

+    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+        if eval_set is None:
+            return None
+        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3450,7 +3513,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))

         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how=
+        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))

@@ -3980,7 +4043,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))

-        return [f.name for f in features_meta if f.type == "categorical"]
+        return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]

     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -3999,6 +4062,7 @@ if response.status_code == 200:
         df = df.rename(columns=original_names_dict)

         self.feature_names_ = []
+        self.external_source_feature_names = []
         self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
@@ -4030,6 +4094,9 @@ if response.status_code == 200:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
             is_client_feature = original_name in df.columns

+            if not is_client_feature:
+                self.external_source_feature_names.append(original_name)
+
             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
@@ -4623,35 +4690,6 @@ if response.status_code == 200:
             self.logger.warning("Failed to dump input files", exc_info=True)


-def _num_samples(x):
-    """Return number of samples in array-like x."""
-    if x is None:
-        return 0
-    message = "Expected sequence or array-like, got %s" % type(x)
-    if hasattr(x, "fit") and callable(x.fit):
-        # Don't get num_samples from an ensembles length!
-        raise TypeError(message)
-
-    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
-        if hasattr(x, "__array__"):
-            x = np.asarray(x)
-        else:
-            raise TypeError(message)
-
-    if hasattr(x, "shape") and x.shape is not None:
-        if len(x.shape) == 0:
-            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
-        # Check that shape is returning an integer or default to len
-        # Dask dataframes may not return numeric shape[0] value
-        if isinstance(x.shape[0], numbers.Integral):
-            return x.shape[0]
-
-    try:
-        return len(x)
-    except TypeError as type_error:
-        raise TypeError(message) from type_error
-
-
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)