upgini 1.2.91a3884.dev5__py3-none-any.whl → 1.2.91a3906.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/unary.py +8 -0
- upgini/dataset.py +107 -58
- upgini/features_enricher.py +191 -227
- upgini/metadata.py +0 -3
- upgini/metrics.py +11 -12
- upgini/resource_bundle/strings.properties +0 -2
- upgini/utils/target_utils.py +199 -3
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.91a3906.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.91a3906.dev1.dist-info}/RECORD +12 -13
- upgini/utils/sample_utils.py +0 -414
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.91a3906.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.91a3906.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -30,7 +30,6 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
-from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -119,9 +118,9 @@ except Exception:
     CustomFallbackProgressBar as ProgressBar,
 )
 
-from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
+    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
@@ -243,7 +242,6 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
-        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -288,7 +286,6 @@ class FeaturesEnricher(TransformerMixin):
 
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
-        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
 
@@ -302,7 +299,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -362,8 +359,10 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-
-
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
+            Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
@@ -376,16 +375,6 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None
 
-    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
-        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
-
-        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-        if maybe_downsampling_limit is not None:
-            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
-            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
-
-        return sample_config
-
     def _get_api_key(self):
         return self._api_key
 
@@ -939,15 +928,16 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-            validated_X
-
+            validated_X = self._validate_X(effective_X)
+            validated_y = self._validate_y(validated_X, effective_y)
+            validated_eval_set = (
+                [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
+                if effective_eval_set is not None
+                else None
             )
 
             if self.X is None:
                 self.X = X
-                self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
-                    X[self.id_columns or []]
-                )
             if self.y is None:
                 self.y = y
             if self.eval_set is None:
@@ -981,19 +971,6 @@ class FeaturesEnricher(TransformerMixin):
             client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
                 estimator, validated_X, self.search_keys
             )
-            if self.id_columns and self.id_columns_encoder is not None:
-                if cat_features_from_backend:
-                    cat_features_from_backend = [
-                        c
-                        for c in cat_features_from_backend
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
-                if client_cat_features:
-                    client_cat_features = [
-                        c
-                        for c in client_cat_features
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
             for cat_feature in cat_features_from_backend:
                 original_cat_feature = self.fit_columns_renaming.get(cat_feature)
                 if original_cat_feature in self.search_keys:
@@ -1268,8 +1245,7 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-
-                self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
+                self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
             mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1523,10 +1499,16 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
        checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
+            if checked_eval_set
+            else None
+        )
 
-        sampled_data = self.
+        sampled_data = self._sample_data_for_metrics(
             trace_id,
             validated_X,
             validated_y,
@@ -1600,11 +1582,7 @@ class FeaturesEnricher(TransformerMixin):
         fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
 
         # Detect and drop constant columns
-        constant_columns = [
-            c
-            for c in FeaturesValidator.find_constant_features(fitting_X)
-            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
-        ]
+        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
         if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
@@ -1647,7 +1625,6 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1659,7 +1636,6 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1687,12 +1663,6 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
-
-            if len(unknown_dict) > 0:
-                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
-
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
@@ -1714,7 +1684,7 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     @dataclass
-    class
+    class _SampledDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
         enriched_X: pd.DataFrame
@@ -1722,7 +1692,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]
 
-    def
+    def _sample_data_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1734,7 +1704,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1742,7 +1712,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.
+            return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
@@ -1751,14 +1721,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.
+            return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.
+            return self.__sample_imbalanced(
                 validated_X,
                 validated_y,
                 eval_set,
@@ -1770,7 +1740,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) ->
+    ) -> _SampledDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
@@ -1787,9 +1757,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __sample_only_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) ->
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
 
         df = validated_X.copy()
@@ -1831,13 +1801,24 @@ class FeaturesEnricher(TransformerMixin):
         normalizer = Normalizer(self.bundle, self.logger)
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
+        # columns_renaming = {c: c for c in df.columns}
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
 
+        num_samples = _num_samples(df)
+        sample_threshold, sample_rows = (
+            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
+            if eval_set is not None
+            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
+        )
+
+        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
+
+        if num_samples > sample_threshold:
+            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
+            df = df.sample(n=sample_rows, random_state=self.random_state)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -1866,12 +1847,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __sample_balanced(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) ->
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys
 
|
@@ -1970,7 +1951,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1970
1951
|
search_keys,
|
1971
1952
|
)
|
1972
1953
|
|
1973
|
-
def
|
1954
|
+
def __sample_imbalanced(
|
1974
1955
|
self,
|
1975
1956
|
validated_X: pd.DataFrame,
|
1976
1957
|
validated_y: pd.Series,
|
@@ -1979,7 +1960,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _SampledDataForMetrics:
         has_eval_set = eval_set is not None
 
         self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2036,58 +2017,61 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __combine_train_and_eval_sets(
-        self,
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
     ) -> pd.DataFrame:
-        df =
-
-
-        if not eval_set:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
             return df
 
         df[EVAL_SET_INDEX] = 0
 
         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y = eval_pair
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
             eval_df_with_index = eval_x.copy()
-
-            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
 
         return df
 
     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
-        force_downsampling = self.__use_force_downsampling(df)
-
-        sample_columns = SampleColumns(
-            ids=self.id_columns,
-            date=self._get_date_column(self.search_keys),
-            target=TARGET,
-            eval_set_index=EVAL_SET_INDEX,
-        )
-
-        return sample(
-            df,
-            self.model_task_type,
-            self.cv,
-            self.sample_config,
-            sample_columns,
-            self.random_state,
-            force_downsampling=force_downsampling,
-            balance=False,
-            logger=self.logger,
-            bundle=self.bundle,
-            warning_callback=self.__log_warning,
-        )
-
-    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-        return (
+        force_downsampling = (
             not self.disable_force_downsampling
             and self.columns_for_online_api is not None
            and num_samples > Dataset.FORCE_SAMPLE_SIZE
         )
 
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
+            )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
+
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
+
+        return df
+
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
@@ -2123,7 +2107,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) ->
+    ) -> _SampledDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
@@ -2154,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher.
+        return FeaturesEnricher._SampledDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
             enriched_X=enriched_X,
@@ -2302,10 +2286,13 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X
-
-
-
+            validated_X = self._validate_X(X, is_transform=True)
+            if y is not None:
+                validated_y = self._validate_y(validated_X, y)
+                df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+            else:
+                validated_y = None
+                df = validated_X
 
             validated_Xy = df.copy()
 
@@ -2359,9 +2346,7 @@ if response.status_code == 200:
 
             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-            columns_to_drop = [
-                c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
-            ]
+            columns_to_drop = [c for c in df.columns if c in self.feature_names_]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
@@ -2565,7 +2550,6 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
-                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
@@ -2669,7 +2653,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.
+                if c not in self.zero_shap_client_features
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2817,8 +2801,13 @@ if response.status_code == 200:
         self.fit_dropped_features = set()
         self.fit_generated_features = []
 
-        validated_X
-
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
+            if eval_set is not None
+            else None
+        )
         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
             msg = self.bundle.get("demo_dataset_info")
@@ -2863,8 +2852,14 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )
 
-        df =
-
+        df = pd.concat([validated_X, validated_y], axis=1)
+
+        if validated_eval_set is not None and len(validated_eval_set) > 0:
+            df[EVAL_SET_INDEX] = 0
+            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
+                eval_df = pd.concat([eval_X, eval_y], axis=1)
+                eval_df[EVAL_SET_INDEX] = idx + 1
+                df = pd.concat([df, eval_df])
 
         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -2975,8 +2970,47 @@ if response.status_code == 200:
         # TODO check maybe need to drop _time column from df_with_original_index
 
         df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-
+
+        # Convert EMAIL to HEM after unnesting to do it only with one column
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
 
         non_feature_columns = [
             self.TARGET_NAME,
@@ -3027,7 +3061,11 @@ if response.status_code == 200:
         runtime_parameters = self._get_copy_of_runtime_parameters()
 
         # Force downsampling to 7000 for API features generation
-        force_downsampling =
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and len(df) > Dataset.FORCE_SAMPLE_SIZE
+        )
         if force_downsampling:
             runtime_parameters.properties["fast_fit"] = True
 
@@ -3047,7 +3085,6 @@ if response.status_code == 200:
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.__log_warning,
-            sample_config=self.sample_config,
         )
         dataset.columns_renaming = self.fit_columns_renaming
 
@@ -3203,49 +3240,6 @@ if response.status_code == 200:
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
-    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
-        email_column = self._get_email_column(self.fit_search_keys)
-        hem_column = self._get_hem_column(self.fit_search_keys)
-        if email_column:
-            converter = EmailSearchKeyConverter(
-                email_column,
-                hem_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-
-        ip_column = self._get_ip_column(self.fit_search_keys)
-        if ip_column:
-            converter = IpSearchKeyConverter(
-                ip_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-        phone_column = self._get_phone_column(self.fit_search_keys)
-        country_column = self._get_country_column(self.fit_search_keys)
-        if phone_column:
-            converter = PhoneSearchKeyConverter(phone_column, country_column)
-            df = converter.convert(df)
-
-        if country_column:
-            converter = CountrySearchKeyConverter(country_column)
-            df = converter.convert(df)
-
-        postal_code = self._get_postal_column(self.fit_search_keys)
-        if postal_code:
-            converter = PostalCodeSearchKeyConverter(postal_code)
-            df = converter.convert(df)
-
-        return df
-
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
@@ -3288,57 +3282,6 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
 
-    def _validate_train_eval(
-        self,
-        X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
-        is_transform: bool = False,
-    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
-        validated_X = self._validate_X(X, is_transform)
-        validated_y = self._validate_y(validated_X, y)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
-        return validated_X, validated_y, validated_eval_set
-
-    def _encode_id_columns(
-        self,
-        X: pd.DataFrame,
-        columns_renaming: Optional[Dict[str, str]] = None,
-    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
-        unknown_dict = {}
-
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
-            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            for i, c in enumerate(renamed_id_columns):
-                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
-                if len(unknown_values) > 0:
-                    unknown_dict[c] = unknown_values
-            X[renamed_id_columns] = encoded
-            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
-
-            if len(unknown_dict) > 0:
-                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
-
-        return X, unknown_dict
-
-    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
-        columns_renaming = columns_renaming or {}
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
-
-        return X
-
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
@@ -3380,9 +3323,7 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y) ->
-        if y is None:
-            return None
+    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
@@ -3429,11 +3370,6 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
-        if eval_set is None:
-            return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
-
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3514,7 +3450,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how=
+        train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
@@ -4044,7 +3980,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical"
+        return [f.name for f in features_meta if f.type == "categorical"]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -4063,7 +3999,7 @@ if response.status_code == 200:
         df = df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
-        self.
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4094,11 +4030,10 @@ if response.status_code == 200:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
             is_client_feature = original_name in df.columns
 
-            if not is_client_feature:
-                self.external_source_feature_names.append(original_name)
-
             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue
 
             # Use only important features
@@ -4688,6 +4623,35 @@ if response.status_code == 200:
             self.logger.warning("Failed to dump input files", exc_info=True)
 
 
+def _num_samples(x):
+    """Return number of samples in array-like x."""
+    if x is None:
+        return 0
+    message = "Expected sequence or array-like, got %s" % type(x)
+    if hasattr(x, "fit") and callable(x.fit):
+        # Don't get num_samples from an ensembles length!
+        raise TypeError(message)
+
+    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
+        if hasattr(x, "__array__"):
+            x = np.asarray(x)
+        else:
+            raise TypeError(message)
+
+    if hasattr(x, "shape") and x.shape is not None:
+        if len(x.shape) == 0:
+            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
+        # Check that shape is returning an integer or default to len
+        # Dask dataframes may not return numeric shape[0] value
+        if isinstance(x.shape[0], numbers.Integral):
+            return x.shape[0]
+
+    try:
+        return len(x)
+    except TypeError as type_error:
+        raise TypeError(message) from type_error
+
+
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)