upgini 1.2.91a3884.dev4__py3-none-any.whl → 1.2.91a3906.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/unary.py +8 -0
- upgini/dataset.py +96 -55
- upgini/features_enricher.py +187 -220
- upgini/metadata.py +0 -3
- upgini/metrics.py +11 -12
- upgini/resource_bundle/strings.properties +0 -2
- upgini/utils/sample_utils.py +0 -414
- upgini/utils/target_utils.py +199 -3
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3906.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3906.dev1.dist-info}/RECORD +12 -13
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3906.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3906.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -30,7 +30,6 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
-from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -119,9 +118,9 @@ except Exception:
     CustomFallbackProgressBar as ProgressBar,
 )
 
-from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
+    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
@@ -243,7 +242,6 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
-        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -288,7 +286,6 @@ class FeaturesEnricher(TransformerMixin):
 
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
-        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
 
@@ -362,8 +359,10 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-
-
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
+            Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
@@ -376,16 +375,6 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None
 
-    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
-        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
-
-        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-        if maybe_downsampling_limit is not None:
-            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
-            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
-
-        return sample_config
-
     def _get_api_key(self):
         return self._api_key
 
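Note on the constructor change above: the per-instance _get_sample_config helper is gone, and a server-provided "downsampling_limit" runtime property is now written straight onto the Dataset class attributes, which is process-global rather than per-enricher. A minimal runnable sketch of the new behavior (the two-attribute Dataset stand-in and the property value are illustrative, not taken from the package):

import_free_sketch = """
# Illustrative stand-in for upgini.dataset.Dataset; only the two attributes
# touched by the new constructor logic are modeled here.
"""

class Dataset:
    FIT_SAMPLE_THRESHOLD = 200_000  # assumed default, for illustration only
    FIT_SAMPLE_ROWS = 200_000

# e.g. runtime_parameters.properties as received from the search backend
runtime_properties = {"downsampling_limit": "50000"}

maybe_downsampling_limit = runtime_properties.get("downsampling_limit")
if maybe_downsampling_limit is not None:
    # Mutating class attributes affects every Dataset (and hence every
    # enricher) in the process, unlike the removed per-instance SampleConfig.
    Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
    Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)

assert Dataset.FIT_SAMPLE_THRESHOLD == 50_000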
@@ -939,15 +928,16 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+        validated_X = self._validate_X(effective_X)
+        validated_y = self._validate_y(validated_X, effective_y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
+            if effective_eval_set is not None
+            else None
         )
 
         if self.X is None:
             self.X = X
-            self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
-                X[self.id_columns or []]
-            )
         if self.y is None:
             self.y = y
         if self.eval_set is None:
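The hunk above inlines what the removed _validate_train_eval wrapper used to do, and _validate_y's contract narrows later in this diff (its None short-circuit is deleted), so callers now branch on y themselves. A runnable sketch of the resulting call pattern, with deliberately simplified stand-in validators (the real checks live in FeaturesEnricher):

from typing import Optional

import pandas as pd

def validate_X(X: pd.DataFrame) -> pd.DataFrame:
    # simplified stand-in for FeaturesEnricher._validate_X
    if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
        raise ValueError("MultiIndex is not supported")
    return X

def validate_y(X: pd.DataFrame, y: pd.Series) -> pd.Series:
    # simplified stand-in for _validate_y; no None handling, mirroring the
    # narrowed pd.Series return annotation introduced by this diff
    if len(y) != len(X):
        raise ValueError("X and y have different lengths")
    return y

X = pd.DataFrame({"feature": [1, 2, 3]})
y: Optional[pd.Series] = pd.Series([0, 1, 0])

validated_X = validate_X(X)
validated_y = validate_y(validated_X, y) if y is not None else None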
@@ -981,19 +971,6 @@ class FeaturesEnricher(TransformerMixin):
             client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
                 estimator, validated_X, self.search_keys
             )
-            if self.id_columns_encoder is not None:
-                if cat_features_from_backend:
-                    cat_features_from_backend = [
-                        c
-                        for c in cat_features_from_backend
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
-                if client_cat_features:
-                    client_cat_features = [
-                        c
-                        for c in client_cat_features
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
             for cat_feature in cat_features_from_backend:
                 original_cat_feature = self.fit_columns_renaming.get(cat_feature)
                 if original_cat_feature in self.search_keys:
@@ -1268,8 +1245,7 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-
-                self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
+                self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
             mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1523,10 +1499,16 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
        is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
+            if checked_eval_set
+            else None
+        )
 
-        sampled_data = self.
+        sampled_data = self._sample_data_for_metrics(
            trace_id,
            validated_X,
            validated_y,
@@ -1600,11 +1582,7 @@ class FeaturesEnricher(TransformerMixin):
         fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
 
         # Detect and drop constant columns
-        constant_columns = [
-            c
-            for c in FeaturesValidator.find_constant_features(fitting_X)
-            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
-        ]
+        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
         if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
@@ -1647,7 +1625,6 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1659,7 +1636,6 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1687,12 +1663,6 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
-
-            if len(unknown_dict) > 0:
-                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
-
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
@@ -1714,7 +1684,7 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     @dataclass
-    class
+    class _SampledDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
        enriched_X: pd.DataFrame
@@ -1722,7 +1692,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]
 
-    def
+    def _sample_data_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1734,7 +1704,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1742,7 +1712,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.
+            return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
@@ -1751,14 +1721,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.
+            return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.
+            return self.__sample_imbalanced(
                 validated_X,
                 validated_y,
                 eval_set,
@@ -1770,7 +1740,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) ->
+    ) -> _SampledDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
@@ -1787,9 +1757,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __sample_only_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) ->
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
 
         df = validated_X.copy()
@@ -1831,13 +1801,24 @@ class FeaturesEnricher(TransformerMixin):
         normalizer = Normalizer(self.bundle, self.logger)
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
+        # columns_renaming = {c: c for c in df.columns}
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
 
+        num_samples = _num_samples(df)
+        sample_threshold, sample_rows = (
+            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
+            if eval_set is not None
+            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
+        )
+
+        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
+
+        if num_samples > sample_threshold:
+            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
+            df = df.sample(n=sample_rows, random_state=self.random_state)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -1866,12 +1847,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __sample_balanced(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) ->
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys
 
@@ -1970,7 +1951,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __sample_imbalanced(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
@@ -1979,7 +1960,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _SampledDataForMetrics:
         has_eval_set = eval_set is not None
 
         self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2036,58 +2017,61 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __combine_train_and_eval_sets(
-        self,
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
     ) -> pd.DataFrame:
-        df =
-
-
-        if not eval_set:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
             return df
 
         df[EVAL_SET_INDEX] = 0
 
         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y = eval_pair
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
             eval_df_with_index = eval_x.copy()
-
-            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
 
         return df
 
     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
-        force_downsampling = self.__use_force_downsampling(df)
-
-        sample_columns = SampleColumns(
-            ids=self.id_columns,
-            date=self._get_date_column(self.search_keys),
-            target=TARGET,
-            eval_set_index=EVAL_SET_INDEX,
-        )
-
-        return sample(
-            df,
-            self.model_task_type,
-            self.cv,
-            self.sample_config,
-            sample_columns,
-            self.random_state,
-            force_downsampling=force_downsampling,
-            balance=False,
-            logger=self.logger,
-            bundle=self.bundle,
-            warning_callback=self.__log_warning,
-        )
-
-    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-
+        force_downsampling = (
             not self.disable_force_downsampling
             and self.columns_for_online_api is not None
            and num_samples > Dataset.FORCE_SAMPLE_SIZE
         )
 
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
+            )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
+
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
+
+        return df
+
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
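The rewritten __downsample_for_metrics above folds the deleted sample_utils.sample call into two inline branches: forced undersampling via balance_undersample_forced when online-API columns are configured, otherwise plain random sampling against eval-set-aware thresholds. A runnable sketch of that decision order follows; the threshold constants are illustrative stand-ins for the Dataset class attributes (only the 7000 force limit is grounded, matching the "Force downsampling to 7000" comment later in this diff), and the force branch is a placeholder for balance_undersample_forced from upgini.utils.target_utils:

import pandas as pd

FORCE_SAMPLE_SIZE = 7_000
FIT_SAMPLE_THRESHOLD, FIT_SAMPLE_ROWS = 200_000, 100_000  # assumed values
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000, 100_000
EVAL_SET_INDEX = "eval_set_index"  # assumed marker column name

def downsample_for_metrics(df: pd.DataFrame, force: bool, random_state: int = 42) -> pd.DataFrame:
    num_samples = len(df)
    if force and num_samples > FORCE_SAMPLE_SIZE:
        # placeholder for balance_undersample_forced(...), which also
        # respects id/date columns and the CV scheme
        return df.sample(n=FORCE_SAMPLE_SIZE, random_state=random_state)
    if num_samples > FIT_SAMPLE_THRESHOLD:
        with_eval = EVAL_SET_INDEX in df.columns
        threshold = FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD if with_eval else FIT_SAMPLE_THRESHOLD
        rows = FIT_SAMPLE_WITH_EVAL_SET_ROWS if with_eval else FIT_SAMPLE_ROWS
        if num_samples > threshold:
            return df.sample(n=rows, random_state=random_state)
    return df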
@@ -2123,7 +2107,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) ->
+    ) -> _SampledDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
@@ -2154,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher.
+        return FeaturesEnricher._SampledDataForMetrics(
            X_sampled=X_sampled,
            y_sampled=y_sampled,
            enriched_X=enriched_X,
@@ -2302,10 +2286,13 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X
-
-
-
+            validated_X = self._validate_X(X, is_transform=True)
+            if y is not None:
+                validated_y = self._validate_y(validated_X, y)
+                df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+            else:
+                validated_y = None
+                df = validated_X
 
             validated_Xy = df.copy()
 
@@ -2359,7 +2346,7 @@ if response.status_code == 200:
 
             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-            columns_to_drop = [c for c in df.columns if c in self.feature_names_
+            columns_to_drop = [c for c in df.columns if c in self.feature_names_]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
@@ -2563,7 +2550,6 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
-                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
@@ -2667,7 +2653,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features
+                if c not in self.zero_shap_client_features
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2815,8 +2801,13 @@ if response.status_code == 200:
         self.fit_dropped_features = set()
         self.fit_generated_features = []
 
-        validated_X
-
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
+            if eval_set is not None
+            else None
+        )
         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
             msg = self.bundle.get("demo_dataset_info")
@@ -2861,8 +2852,14 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )
 
-        df =
-
+        df = pd.concat([validated_X, validated_y], axis=1)
+
+        if validated_eval_set is not None and len(validated_eval_set) > 0:
+            df[EVAL_SET_INDEX] = 0
+            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
+                eval_df = pd.concat([eval_X, eval_y], axis=1)
+                eval_df[EVAL_SET_INDEX] = idx + 1
+                df = pd.concat([df, eval_df])
 
         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
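The fit hunk above builds the combined train/eval frame inline: train rows are tagged 0 in the eval-set marker column and each eval pair gets 1, 2, and so on. A runnable sketch of the resulting layout (the column-name constants and the data values are illustrative; in the package they come from upgini.metadata):

import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumed constant value
TARGET = "target"                  # assumed constant value

train_X = pd.DataFrame({"feature": [1, 2]})
train_y = pd.Series([0, 1], name=TARGET)
eval_X = pd.DataFrame({"feature": [3]})
eval_y = pd.Series([1], name=TARGET)

df = pd.concat([train_X, train_y], axis=1)
df[EVAL_SET_INDEX] = 0
for idx, (ex, ey) in enumerate([(eval_X, eval_y)]):
    eval_df = pd.concat([ex, ey], axis=1)
    eval_df[EVAL_SET_INDEX] = idx + 1
    df = pd.concat([df, eval_df])

print(df)
#    feature  target  eval_set_index
# 0        1       0               0
# 1        2       1               0
# 0        3       1               1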
@@ -2973,8 +2970,47 @@ if response.status_code == 200:
         # TODO check maybe need to drop _time column from df_with_original_index
 
         df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-
+
+        # Convert EMAIL to HEM after unnesting to do it only with one column
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
 
         non_feature_columns = [
             self.TARGET_NAME,
@@ -3025,7 +3061,11 @@ if response.status_code == 200:
         runtime_parameters = self._get_copy_of_runtime_parameters()
 
         # Force downsampling to 7000 for API features generation
-        force_downsampling =
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and len(df) > Dataset.FORCE_SAMPLE_SIZE
+        )
         if force_downsampling:
             runtime_parameters.properties["fast_fit"] = True
 
@@ -3045,7 +3085,6 @@ if response.status_code == 200:
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.__log_warning,
-            sample_config=self.sample_config,
         )
         dataset.columns_renaming = self.fit_columns_renaming
 
@@ -3201,49 +3240,6 @@ if response.status_code == 200:
             if not self.warning_counter.has_warnings():
                 self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
-    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
-        email_column = self._get_email_column(self.fit_search_keys)
-        hem_column = self._get_hem_column(self.fit_search_keys)
-        if email_column:
-            converter = EmailSearchKeyConverter(
-                email_column,
-                hem_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-
-        ip_column = self._get_ip_column(self.fit_search_keys)
-        if ip_column:
-            converter = IpSearchKeyConverter(
-                ip_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-        phone_column = self._get_phone_column(self.fit_search_keys)
-        country_column = self._get_country_column(self.fit_search_keys)
-        if phone_column:
-            converter = PhoneSearchKeyConverter(phone_column, country_column)
-            df = converter.convert(df)
-
-        if country_column:
-            converter = CountrySearchKeyConverter(country_column)
-            df = converter.convert(df)
-
-        postal_code = self._get_postal_column(self.fit_search_keys)
-        if postal_code:
-            converter = PostalCodeSearchKeyConverter(postal_code)
-            df = converter.convert(df)
-
-        return df
-
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
@@ -3286,57 +3282,6 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
 
-    def _validate_train_eval(
-        self,
-        X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
-        is_transform: bool = False,
-    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
-        validated_X = self._validate_X(X, is_transform)
-        validated_y = self._validate_y(validated_X, y)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
-        return validated_X, validated_y, validated_eval_set
-
-    def _encode_id_columns(
-        self,
-        X: pd.DataFrame,
-        columns_renaming: Optional[Dict[str, str]] = None,
-    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
-        unknown_dict = {}
-
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
-            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            for i, c in enumerate(renamed_id_columns):
-                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
-                if len(unknown_values) > 0:
-                    unknown_dict[c] = unknown_values
-            X[renamed_id_columns] = encoded
-            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
-
-        if len(unknown_dict) > 0:
-            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
-
-        return X, unknown_dict
-
-    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
-        columns_renaming = columns_renaming or {}
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
-
-        return X
-
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
@@ -3378,9 +3323,7 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y) ->
-        if y is None:
-            return None
+    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
@@ -3427,11 +3370,6 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
-        if eval_set is None:
-            return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
-
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3512,7 +3450,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how=
+        train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
@@ -4042,7 +3980,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical"
+        return [f.name for f in features_meta if f.type == "categorical"]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -4685,6 +4623,35 @@ if response.status_code == 200:
             self.logger.warning("Failed to dump input files", exc_info=True)
 
 
+def _num_samples(x):
+    """Return number of samples in array-like x."""
+    if x is None:
+        return 0
+    message = "Expected sequence or array-like, got %s" % type(x)
+    if hasattr(x, "fit") and callable(x.fit):
+        # Don't get num_samples from an ensembles length!
+        raise TypeError(message)
+
+    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
+        if hasattr(x, "__array__"):
+            x = np.asarray(x)
+        else:
+            raise TypeError(message)
+
+    if hasattr(x, "shape") and x.shape is not None:
+        if len(x.shape) == 0:
+            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
+        # Check that shape is returning an integer or default to len
+        # Dask dataframes may not return numeric shape[0] value
+        if isinstance(x.shape[0], numbers.Integral):
+            return x.shape[0]
+
+    try:
+        return len(x)
+    except TypeError as type_error:
+        raise TypeError(message) from type_error
+
+
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)