upgini 1.2.90__py3-none-any.whl → 1.2.91a3884.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +55 -96
- upgini/features_enricher.py +207 -187
- upgini/metadata.py +3 -0
- upgini/metrics.py +11 -10
- upgini/resource_bundle/strings.properties +2 -0
- upgini/utils/sample_utils.py +410 -0
- upgini/utils/target_utils.py +3 -199
- {upgini-1.2.90.dist-info → upgini-1.2.91a3884.dev2.dist-info}/METADATA +10 -1
- {upgini-1.2.90.dist-info → upgini-1.2.91a3884.dev2.dist-info}/RECORD +12 -11
- {upgini-1.2.90.dist-info → upgini-1.2.91a3884.dev2.dist-info}/WHEEL +1 -1
- {upgini-1.2.90.dist-info → upgini-1.2.91a3884.dev2.dist-info}/licenses/LICENSE +0 -0
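The headline change is sampling configuration: the ad-hoc `Dataset.FIT_SAMPLE_*` class attributes and `balance_undersample_forced` are replaced by a `SampleConfig` carried through the new `upgini/utils/sample_utils.py` module, and `FeaturesEnricher` gains a `sample_config` constructor argument. A minimal sketch of the new surface, assuming `SampleConfig` accepts the field names visible in the hunks below as keyword arguments (other fields and defaults are not shown in this diff):

```python
from upgini import FeaturesEnricher, SearchKey
from upgini.utils.sample_utils import SampleConfig

# Field names are taken from this diff; values here are illustrative.
config = SampleConfig(
    force_sample_size=7000,        # cap applied when force downsampling kicks in
    fit_sample_threshold=100_000,  # downsample fit data only above this row count
    fit_sample_rows=100_000,       # target row count after downsampling
)

enricher = FeaturesEnricher(
    search_keys={"order_date": SearchKey.DATE},
    sample_config=config,  # new optional argument in this release
)
```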
upgini/features_enricher.py
CHANGED
```diff
@@ -30,6 +30,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
```
```diff
@@ -118,9 +119,9 @@ except Exception:
     CustomFallbackProgressBar as ProgressBar,
 )
 
+from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
-    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
```
```diff
@@ -242,6 +243,7 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
+        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
```
```diff
@@ -286,6 +288,7 @@ class FeaturesEnricher(TransformerMixin):
 
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
+        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
 
```
```diff
@@ -359,10 +362,8 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-        if maybe_downsampling_limit is not None:
-            Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
-            Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
+
+        self.sample_config = self._get_sample_config(sample_config)
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
```
```diff
@@ -375,6 +376,16 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None
 
+    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
+
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
+            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
+
+        return sample_config
+
     def _get_api_key(self):
         return self._api_key
 
```
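`_get_sample_config` establishes a precedence order: an explicitly passed `SampleConfig` is used as-is, then a server-driven `downsampling_limit` runtime property overrides both `fit_sample_threshold` and `fit_sample_rows`. A self-contained sketch of that resolution logic; the stand-in `SampleConfig` defaults below are placeholders, not upgini's real values:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class SampleConfig:  # simplified stand-in for upgini.utils.sample_utils.SampleConfig
    force_sample_size: int = 7000
    fit_sample_threshold: int = 200_000  # placeholder default
    fit_sample_rows: int = 200_000       # placeholder default

def resolve_sample_config(sample_config: Optional[SampleConfig], runtime_properties: dict) -> SampleConfig:
    # Mirrors _get_sample_config above: start from the caller's config (or defaults),
    # then let the "downsampling_limit" runtime property override the fit limits.
    config = sample_config or SampleConfig()
    limit = runtime_properties.get("downsampling_limit")
    if limit is not None:
        config.fit_sample_rows = int(limit)
        config.fit_sample_threshold = int(limit)
    return config

print(resolve_sample_config(None, {"downsampling_limit": "50000"}).fit_sample_rows)  # 50000
```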
```diff
@@ -928,16 +939,15 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-        validated_X = self._validate_X(effective_X)
-        validated_y = self._validate_y(validated_X, effective_y)
-        validated_eval_set = (
-            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
-            if effective_eval_set is not None
-            else None
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+            effective_X, effective_y, effective_eval_set
         )
 
         if self.X is None:
             self.X = X
+            self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
+                X[self.id_columns or []]
+            )
         if self.y is None:
             self.y = y
         if self.eval_set is None:
```
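The id-column handling above relies on stock scikit-learn behaviour: an `OrdinalEncoder` fitted with `handle_unknown="use_encoded_value", unknown_value=-1` maps values unseen at fit time to -1, which `_encode_id_columns` later uses to report and drop such rows. A standalone demonstration (the column name is illustrative):

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train = pd.DataFrame({"store_id": ["a", "b", "c"]})
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(train)

eval_x = pd.DataFrame({"store_id": ["b", "z"]})  # "z" was never seen during fit
encoded = encoder.transform(eval_x)              # array([[ 1.], [-1.]])

# Same detection idea as _encode_id_columns: collect unknowns, then filter those rows.
unknown = eval_x[encoded[:, 0] == -1]["store_id"].unique().tolist()
print(unknown)  # ['z'] -> surfaced via the new unknown_id_column_value_in_eval_set message
```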
```diff
@@ -1245,7 +1255,8 @@ class FeaturesEnricher(TransformerMixin):
             metrics.append(eval_metrics)
 
         if updating_shaps is not None:
-            self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
+            decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+            self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
         metrics_df = pd.DataFrame(metrics)
         mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
```
```diff
@@ -1499,16 +1510,10 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-        validated_X = self._validate_X(X)
-        validated_y = self._validate_y(validated_X, y)
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_eval_set = (
-            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
-            if checked_eval_set
-            else None
-        )
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
 
-        sampled_data = self.
+        sampled_data = self._get_enriched_for_metrics(
             trace_id,
             validated_X,
             validated_y,
```
```diff
@@ -1582,7 +1587,11 @@ class FeaturesEnricher(TransformerMixin):
             fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
 
         # Detect and drop constant columns
-        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
+        constant_columns = [
+            c
+            for c in FeaturesValidator.find_constant_features(fitting_X)
+            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
+        ]
         if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
```
```diff
@@ -1625,6 +1634,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
+        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
```
```diff
@@ -1636,6 +1646,7 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
```
```diff
@@ -1663,6 +1674,12 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
+
+            if len(unknown_dict) > 0:
+                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
+
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
```
```diff
@@ -1684,7 +1701,7 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     @dataclass
-    class
+    class _EnrichedDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
         enriched_X: pd.DataFrame
```
```diff
@@ -1692,7 +1709,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]
 
-    def
+    def _get_enriched_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
```
```diff
@@ -1704,7 +1721,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
```
```diff
@@ -1712,7 +1729,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.
+            return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
```
```diff
@@ -1721,14 +1738,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.
+            return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.
+            return self.__get_enriched_from_transform(
                 validated_X,
                 validated_y,
                 eval_set,
```
```diff
@@ -1740,7 +1757,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
```
```diff
@@ -1757,9 +1774,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __get_enriched_as_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
 
         df = validated_X.copy()
```
```diff
@@ -1801,24 +1818,13 @@ class FeaturesEnricher(TransformerMixin):
         normalizer = Normalizer(self.bundle, self.logger)
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
-        # columns_renaming = {c: c for c in df.columns}
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-        num_samples = _num_samples(df)
-        sample_threshold, sample_rows = (
-            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
-            if eval_set is not None
-            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
-        )
-
         df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
-        if num_samples > sample_threshold:
-            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
-            df = df.sample(n=sample_rows, random_state=self.random_state)
+        df = self.__downsample_for_metrics(df)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
```
```diff
@@ -1847,12 +1853,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __get_enriched_from_fit(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys
 
```
```diff
@@ -1951,7 +1957,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def
+    def __get_enriched_from_transform(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
```
|
|
1960
1966
|
trace_id: str,
|
1961
1967
|
progress_bar: Optional[ProgressBar],
|
1962
1968
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
1963
|
-
) ->
|
1969
|
+
) -> _EnrichedDataForMetrics:
|
1964
1970
|
has_eval_set = eval_set is not None
|
1965
1971
|
|
1966
1972
|
self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
|
```diff
@@ -2017,61 +2023,58 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __combine_train_and_eval_sets(
-        self,
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
     ) -> pd.DataFrame:
-        df =
-
-
+        df = X.copy()
+        if y is not None:
+            df[TARGET] = y
+        if not eval_set:
             return df
 
         df[EVAL_SET_INDEX] = 0
 
         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y =
+            eval_x, eval_y = eval_pair
             eval_df_with_index = eval_x.copy()
-
+            if eval_y is not None:
+                eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
 
         return df
 
     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        force_downsampling = self.__use_force_downsampling(df)
+
+        sample_columns = SampleColumns(
+            ids=self.id_columns,
+            date=self._get_date_column(self.search_keys),
+            target=TARGET,
+            eval_set_index=EVAL_SET_INDEX,
+        )
+
+        return sample(
+            df,
+            self.model_task_type,
+            self.cv,
+            self.sample_config,
+            sample_columns,
+            self.random_state,
+            force_downsampling=force_downsampling,
+            balance=False,
+            logger=self.logger,
+            bundle=self.bundle,
+            warning_callback=self.__log_warning,
+        )
+
+    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-        force_downsampling = (
+        return (
             not self.disable_force_downsampling
             and self.columns_for_online_api is not None
            and num_samples > Dataset.FORCE_SAMPLE_SIZE
         )
 
-        if force_downsampling:
-            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-            return balance_undersample_forced(
-                df=df,
-                target_column=TARGET,
-                id_columns=self.id_columns,
-                date_column=self._get_date_column(self.search_keys),
-                task_type=self.model_task_type,
-                cv_type=self.cv,
-                random_state=self.random_state,
-                sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                logger=self.logger,
-                bundle=self.bundle,
-                warning_callback=self.__log_warning,
-            )
-        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-            if EVAL_SET_INDEX in df.columns:
-                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-            else:
-                threshold = Dataset.FIT_SAMPLE_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_ROWS
-
-            if num_samples > threshold:
-                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
-                return df.sample(n=sample_size, random_state=self.random_state)
-
-        return df
-
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
```
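All metric-time downsampling now funnels through `sample()` from `upgini.utils.sample_utils`. A hedged usage sketch that follows only the argument order visible in `__downsample_for_metrics` above; whether `None` is accepted for the task-type and CV arguments, and which keyword arguments are optional, are assumptions:

```python
import pandas as pd
from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample

df = pd.DataFrame(
    {"store_id": [1, 2, 3], "date": ["2024-01-01", "2024-01-02", "2024-01-03"], "target": [0, 1, 0]}
)
sample_columns = SampleColumns(ids=["store_id"], date="date", target="target", eval_set_index="eval_set_index")

sampled = sample(
    df,
    None,                                  # model task type (autodetected elsewhere; None is an assumption)
    None,                                  # CV type (assumption)
    SampleConfig(force_sample_size=7000),
    sample_columns,
    42,                                    # random_state
    force_downsampling=False,
    balance=False,
)
```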
```diff
@@ -2107,7 +2110,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) ->
+    ) -> _EnrichedDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
```
```diff
@@ -2138,7 +2141,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher.
+        return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
             enriched_X=enriched_X,
```
```diff
@@ -2286,13 +2289,10 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X = self._validate_X(X, is_transform=True)
-
-            if y is not None:
-                validated_y = self._validate_y(validated_X, y)
-            else:
-                validated_y = None
-            df = validated_X
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                X, y, eval_set=None, is_transform=True
+            )
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
             validated_Xy = df.copy()
 
```
```diff
@@ -2346,7 +2346,7 @@ if response.status_code == 200:
 
             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-            columns_to_drop = [c for c in df.columns if c in self.feature_names_]
+            columns_to_drop = [c for c in df.columns if c in self.feature_names_ and c not in (self.id_columns or [])]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
```
```diff
@@ -2550,6 +2550,7 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
+                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
```
```diff
@@ -2653,7 +2654,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features
+                if c not in self.zero_shap_client_features or c in (self.id_columns or [])
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
```
```diff
@@ -2801,13 +2802,8 @@ if response.status_code == 200:
             self.fit_dropped_features = set()
             self.fit_generated_features = []
 
-            validated_X = self._validate_X(X)
-            validated_y = self._validate_y(validated_X, y)
-            validated_eval_set = (
-                [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
-                if eval_set is not None
-                else None
-            )
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
+
             is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
             if is_demo_dataset:
                 msg = self.bundle.get("demo_dataset_info")
```
```diff
@@ -2852,14 +2848,8 @@ if response.status_code == 200:
                 remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             )
 
-            df = pd.concat([validated_X, validated_y], axis=1)
-
-            if validated_eval_set is not None and len(validated_eval_set) > 0:
-                df[EVAL_SET_INDEX] = 0
-                for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
-                    eval_df = pd.concat([eval_X, eval_y], axis=1)
-                    eval_df[EVAL_SET_INDEX] = idx + 1
-                    df = pd.concat([df, eval_df])
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
+            self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
 
             self.fit_search_keys = self.search_keys.copy()
             df = self.__handle_index_search_keys(df, self.fit_search_keys)
```
```diff
@@ -2970,47 +2960,8 @@ if response.status_code == 200:
             # TODO check maybe need to drop _time column from df_with_original_index
 
             df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-
-            email_column = self._get_email_column(self.fit_search_keys)
-            hem_column = self._get_hem_column(self.fit_search_keys)
-            if email_column:
-                converter = EmailSearchKeyConverter(
-                    email_column,
-                    hem_column,
-                    self.fit_search_keys,
-                    self.fit_columns_renaming,
-                    list(unnest_search_keys.keys()),
-                    self.bundle,
-                    self.logger,
-                )
-                df = converter.convert(df)
-
-            ip_column = self._get_ip_column(self.fit_search_keys)
-            if ip_column:
-                converter = IpSearchKeyConverter(
-                    ip_column,
-                    self.fit_search_keys,
-                    self.fit_columns_renaming,
-                    list(unnest_search_keys.keys()),
-                    self.bundle,
-                    self.logger,
-                )
-                df = converter.convert(df)
-            phone_column = self._get_phone_column(self.fit_search_keys)
-            country_column = self._get_country_column(self.fit_search_keys)
-            if phone_column:
-                converter = PhoneSearchKeyConverter(phone_column, country_column)
-                df = converter.convert(df)
-
-            if country_column:
-                converter = CountrySearchKeyConverter(country_column)
-                df = converter.convert(df)
-
-            postal_code = self._get_postal_column(self.fit_search_keys)
-            if postal_code:
-                converter = PostalCodeSearchKeyConverter(postal_code)
-                df = converter.convert(df)
+            # Convert EMAIL to HEM etc after unnesting to do it only with one column
+            df = self.__convert_unnestable_keys(df, unnest_search_keys)
 
             non_feature_columns = [
                 self.TARGET_NAME,
```
```diff
@@ -3061,11 +3012,7 @@ if response.status_code == 200:
             runtime_parameters = self._get_copy_of_runtime_parameters()
 
             # Force downsampling to 7000 for API features generation
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and len(df) > Dataset.FORCE_SAMPLE_SIZE
-            )
+            force_downsampling = self.__use_force_downsampling(df)
             if force_downsampling:
                 runtime_parameters.properties["fast_fit"] = True
 
```
```diff
@@ -3085,6 +3032,7 @@ if response.status_code == 200:
                 logger=self.logger,
                 bundle=self.bundle,
                 warning_callback=self.__log_warning,
+                sample_config=self.sample_config,
             )
             dataset.columns_renaming = self.fit_columns_renaming
 
```
```diff
@@ -3240,6 +3188,49 @@ if response.status_code == 200:
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
+    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
+
+        return df
+
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
```
```diff
@@ -3282,6 +3273,57 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
 
+    def _validate_train_eval(
+        self,
+        X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
+        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
+        is_transform: bool = False,
+    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
+        validated_X = self._validate_X(X, is_transform)
+        validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        return validated_X, validated_y, validated_eval_set
+
+    def _encode_id_columns(
+        self,
+        X: pd.DataFrame,
+        columns_renaming: Optional[Dict[str, str]] = None,
+    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+        columns_renaming = columns_renaming or {}
+        unknown_dict = {}
+
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            for i, c in enumerate(renamed_id_columns):
+                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                if len(unknown_values) > 0:
+                    unknown_dict[c] = unknown_values
+            X[renamed_id_columns] = encoded
+            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+        if len(unknown_dict) > 0:
+            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+
+        return X, unknown_dict
+
+    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+        columns_renaming = columns_renaming or {}
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            X[renamed_id_columns] = decoded
+
+        return X
+
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
```
```diff
@@ -3323,7 +3365,9 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
+    def _validate_y(self, X: pd.DataFrame, y) -> Optional[pd.Series]:
+        if y is None:
+            return None
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
```
```diff
@@ -3370,6 +3414,11 @@ if response.status_code == 200:
 
         return validated_y
 
+    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+        if eval_set is None:
+            return None
+        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
```
```diff
@@ -3450,7 +3499,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how=
+        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
```
```diff
@@ -3980,7 +4029,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical"]
+        return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
```
```diff
@@ -4623,35 +4672,6 @@ if response.status_code == 200:
         self.logger.warning("Failed to dump input files", exc_info=True)
 
 
-def _num_samples(x):
-    """Return number of samples in array-like x."""
-    if x is None:
-        return 0
-    message = "Expected sequence or array-like, got %s" % type(x)
-    if hasattr(x, "fit") and callable(x.fit):
-        # Don't get num_samples from an ensembles length!
-        raise TypeError(message)
-
-    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
-        if hasattr(x, "__array__"):
-            x = np.asarray(x)
-        else:
-            raise TypeError(message)
-
-    if hasattr(x, "shape") and x.shape is not None:
-        if len(x.shape) == 0:
-            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
-        # Check that shape is returning an integer or default to len
-        # Dask dataframes may not return numeric shape[0] value
-        if isinstance(x.shape[0], numbers.Integral):
-            return x.shape[0]
-
-    try:
-        return len(x)
-    except TypeError as type_error:
-        raise TypeError(message) from type_error
-
-
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)
```