upgini 1.2.21__tar.gz → 1.2.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.21 → upgini-1.2.23}/PKG-INFO +1 -1
- upgini-1.2.23/src/upgini/__about__.py +1 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/features_enricher.py +81 -79
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/metrics.py +13 -8
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/normalizer/normalize_utils.py +2 -14
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/strings.properties +45 -48
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/datetime_utils.py +5 -26
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/deduplicate_utils.py +41 -33
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/features_validator.py +8 -15
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/warning_counter.py +1 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/version_validator.py +7 -3
- upgini-1.2.21/src/upgini/__about__.py +0 -1
- {upgini-1.2.21 → upgini-1.2.23}/.gitignore +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/LICENSE +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/README.md +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/pyproject.toml +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/dataset.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/errors.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/http.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/metadata.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/search_task.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/spinner.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/track_info.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.23"
|
|
@@ -77,8 +77,8 @@ from upgini.utils.cv_utils import CVConfig, get_groups
|
|
|
77
77
|
from upgini.utils.datetime_utils import (
|
|
78
78
|
DateTimeSearchKeyConverter,
|
|
79
79
|
is_blocked_time_series,
|
|
80
|
+
is_dates_distribution_valid,
|
|
80
81
|
is_time_series,
|
|
81
|
-
validate_dates_distribution,
|
|
82
82
|
)
|
|
83
83
|
from upgini.utils.deduplicate_utils import (
|
|
84
84
|
clean_full_duplicates,
|
|
@@ -263,7 +263,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
263
263
|
dict()
|
|
264
264
|
)
|
|
265
265
|
|
|
266
|
-
validate_version(self.logger)
|
|
266
|
+
validate_version(self.logger, self.__log_warning)
|
|
267
267
|
self.search_keys = search_keys or {}
|
|
268
268
|
self.country_code = country_code
|
|
269
269
|
self.__validate_search_keys(search_keys, search_id)
|
|
@@ -723,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
723
723
|
|
|
724
724
|
start_time = time.time()
|
|
725
725
|
try:
|
|
726
|
-
result, _ = self.__inner_transform(
|
|
726
|
+
result, _, _ = self.__inner_transform(
|
|
727
727
|
trace_id,
|
|
728
728
|
X,
|
|
729
729
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -951,9 +951,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
951
951
|
gc.collect()
|
|
952
952
|
|
|
953
953
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
954
|
-
|
|
955
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
956
|
-
self.warning_counter.increment()
|
|
954
|
+
self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
|
|
957
955
|
return None
|
|
958
956
|
|
|
959
957
|
print(self.bundle.get("metrics_start"))
|
|
@@ -1654,9 +1652,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1654
1652
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1655
1653
|
generated_features = []
|
|
1656
1654
|
if date_column is not None:
|
|
1657
|
-
converter = DateTimeSearchKeyConverter(
|
|
1658
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1659
|
-
)
|
|
1655
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1660
1656
|
df = converter.convert(df, keep_time=True)
|
|
1661
1657
|
generated_features = converter.generated_features
|
|
1662
1658
|
|
|
@@ -1666,11 +1662,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1666
1662
|
df = generator.generate(df)
|
|
1667
1663
|
generated_features.extend(generator.generated_features)
|
|
1668
1664
|
|
|
1669
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
1665
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
1670
1666
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1671
1667
|
columns_renaming = normalizer.columns_renaming
|
|
1672
1668
|
|
|
1673
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1669
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1674
1670
|
|
|
1675
1671
|
num_samples = _num_samples(df)
|
|
1676
1672
|
sample_threshold, sample_rows = (
|
|
@@ -1817,7 +1813,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1817
1813
|
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1818
1814
|
df = pd.concat([df, eval_df_with_index])
|
|
1819
1815
|
|
|
1820
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1816
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1821
1817
|
|
|
1822
1818
|
# downsample if need to eval_set threshold
|
|
1823
1819
|
num_samples = _num_samples(df)
|
|
@@ -1830,7 +1826,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1830
1826
|
tmp_target_name = "__target"
|
|
1831
1827
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1832
1828
|
|
|
1833
|
-
enriched_df, columns_renaming = self.__inner_transform(
|
|
1829
|
+
enriched_df, columns_renaming, generated_features = self.__inner_transform(
|
|
1834
1830
|
trace_id,
|
|
1835
1831
|
df,
|
|
1836
1832
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1847,7 +1843,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1847
1843
|
|
|
1848
1844
|
x_columns = [
|
|
1849
1845
|
c
|
|
1850
|
-
for c in (validated_X.columns.tolist() +
|
|
1846
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1851
1847
|
if c in enriched_df.columns
|
|
1852
1848
|
]
|
|
1853
1849
|
|
|
@@ -1869,7 +1865,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1869
1865
|
|
|
1870
1866
|
df[TARGET] = validated_y
|
|
1871
1867
|
|
|
1872
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1868
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1873
1869
|
|
|
1874
1870
|
num_samples = _num_samples(df)
|
|
1875
1871
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
@@ -1879,7 +1875,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1875
|
tmp_target_name = "__target"
|
|
1880
1876
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1881
1877
|
|
|
1882
|
-
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1878
|
+
enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
|
|
1883
1879
|
trace_id,
|
|
1884
1880
|
df,
|
|
1885
1881
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1896,7 +1892,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1892
|
|
|
1897
1893
|
x_columns = [
|
|
1898
1894
|
c
|
|
1899
|
-
for c in (validated_X.columns.tolist() +
|
|
1895
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1900
1896
|
if c in enriched_Xy.columns
|
|
1901
1897
|
]
|
|
1902
1898
|
|
|
@@ -1904,7 +1900,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1904
1900
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1905
1901
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1906
1902
|
|
|
1907
|
-
datasets_hash = hash_input(
|
|
1903
|
+
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
1908
1904
|
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1909
1905
|
X_sampled,
|
|
1910
1906
|
y_sampled,
|
|
@@ -2023,7 +2019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2023
2019
|
progress_bar: Optional[ProgressBar] = None,
|
|
2024
2020
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2025
2021
|
add_fit_system_record_id: bool = False,
|
|
2026
|
-
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
2022
|
+
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2027
2023
|
if self._search_task is None:
|
|
2028
2024
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2029
2025
|
|
|
@@ -2036,24 +2032,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2036
2032
|
|
|
2037
2033
|
if len(self.feature_names_) == 0:
|
|
2038
2034
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
2039
|
-
return X, {c: c for c in X.columns}
|
|
2035
|
+
return X, {c: c for c in X.columns}, []
|
|
2040
2036
|
|
|
2041
2037
|
if self._has_paid_features(exclude_features_sources):
|
|
2042
2038
|
msg = self.bundle.get("transform_with_paid_features")
|
|
2043
2039
|
self.logger.warning(msg)
|
|
2044
2040
|
self.__display_support_link(msg)
|
|
2045
|
-
return None, {c: c for c in X.columns}
|
|
2041
|
+
return None, {c: c for c in X.columns}, []
|
|
2046
2042
|
|
|
2047
2043
|
if not metrics_calculation:
|
|
2048
2044
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
2049
2045
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
2050
2046
|
if transform_usage.has_limit:
|
|
2051
2047
|
if len(X) > transform_usage.rest_rows:
|
|
2052
|
-
|
|
2048
|
+
rest_rows = max(transform_usage.rest_rows, 0)
|
|
2049
|
+
msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
|
|
2053
2050
|
self.logger.warning(msg)
|
|
2054
2051
|
print(msg)
|
|
2055
2052
|
show_request_quote_button()
|
|
2056
|
-
return None, {c: c for c in X.columns}
|
|
2053
|
+
return None, {c: c for c in X.columns}, []
|
|
2057
2054
|
else:
|
|
2058
2055
|
msg = self.bundle.get("transform_usage_info").format(
|
|
2059
2056
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -2093,9 +2090,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2093
2090
|
generated_features = []
|
|
2094
2091
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2095
2092
|
if date_column is not None:
|
|
2096
|
-
converter = DateTimeSearchKeyConverter(
|
|
2097
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2098
|
-
)
|
|
2093
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2099
2094
|
df = converter.convert(df)
|
|
2100
2095
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2101
2096
|
generated_features.extend(converter.generated_features)
|
|
@@ -2110,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2110
2105
|
df = generator.generate(df)
|
|
2111
2106
|
generated_features.extend(generator.generated_features)
|
|
2112
2107
|
|
|
2113
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2108
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2114
2109
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2115
2110
|
columns_renaming = normalizer.columns_renaming
|
|
2116
2111
|
|
|
@@ -2176,7 +2171,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2176
2171
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2177
2172
|
df = converter.convert(df)
|
|
2178
2173
|
|
|
2179
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2174
|
+
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2180
2175
|
|
|
2181
2176
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2182
2177
|
for col in features_for_transform:
|
|
@@ -2216,9 +2211,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2216
2211
|
|
|
2217
2212
|
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2218
2213
|
|
|
2219
|
-
df_without_features = clean_full_duplicates(
|
|
2220
|
-
df_without_features, self.logger,
|
|
2214
|
+
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2215
|
+
df_without_features, self.logger, bundle=self.bundle
|
|
2221
2216
|
)
|
|
2217
|
+
if not silent_mode and full_duplicates_warning:
|
|
2218
|
+
self.__log_warning(full_duplicates_warning)
|
|
2222
2219
|
|
|
2223
2220
|
del df
|
|
2224
2221
|
gc.collect()
|
|
@@ -2337,7 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2337
2334
|
if add_fit_system_record_id:
|
|
2338
2335
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2339
2336
|
|
|
2340
|
-
return result, columns_renaming
|
|
2337
|
+
return result, columns_renaming, generated_features
|
|
2341
2338
|
|
|
2342
2339
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2343
2340
|
features_info = self._internal_features_info
|
|
@@ -2415,6 +2412,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2415
2412
|
def __is_registered(self) -> bool:
|
|
2416
2413
|
return self.api_key is not None and self.api_key != ""
|
|
2417
2414
|
|
|
2415
|
+
def __log_warning(self, message: str, show_support_link: bool = False):
|
|
2416
|
+
warning_num = self.warning_counter.increment()
|
|
2417
|
+
formatted_message = f"WARNING #{warning_num}: {message}\n"
|
|
2418
|
+
if show_support_link:
|
|
2419
|
+
self.__display_support_link(formatted_message)
|
|
2420
|
+
else:
|
|
2421
|
+
print(formatted_message)
|
|
2422
|
+
self.logger.warning(message)
|
|
2423
|
+
|
|
2418
2424
|
def __inner_fit(
|
|
2419
2425
|
self,
|
|
2420
2426
|
trace_id: str,
|
|
@@ -2461,9 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2461
2467
|
checked_generate_features = []
|
|
2462
2468
|
for gen_feature in self.generate_features:
|
|
2463
2469
|
if gen_feature not in x_columns:
|
|
2464
|
-
|
|
2465
|
-
print(msg)
|
|
2466
|
-
self.logger.warning(msg)
|
|
2470
|
+
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2467
2471
|
else:
|
|
2468
2472
|
checked_generate_features.append(gen_feature)
|
|
2469
2473
|
self.generate_features = checked_generate_features
|
|
@@ -2524,9 +2528,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2524
2528
|
self.date_format,
|
|
2525
2529
|
self.logger,
|
|
2526
2530
|
bundle=self.bundle,
|
|
2527
|
-
warnings_counter=self.warning_counter,
|
|
2528
2531
|
)
|
|
2529
2532
|
df = converter.convert(df, keep_time=True)
|
|
2533
|
+
if converter.has_old_dates:
|
|
2534
|
+
self.__log_warning(self.bundle.get("dataset_drop_old_dates"))
|
|
2530
2535
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2531
2536
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2532
2537
|
else:
|
|
@@ -2541,23 +2546,36 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2541
2546
|
self.fit_generated_features.extend(generator.generated_features)
|
|
2542
2547
|
|
|
2543
2548
|
# Checks that need validated date
|
|
2544
|
-
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2545
2549
|
|
|
2546
|
-
if
|
|
2550
|
+
if not is_dates_distribution_valid(df, self.fit_search_keys):
|
|
2551
|
+
self.__log_warning(bundle.get("x_unstable_by_date"))
|
|
2552
|
+
|
|
2553
|
+
if (
|
|
2554
|
+
is_numeric_dtype(df[self.TARGET_NAME])
|
|
2555
|
+
and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
|
|
2556
|
+
and has_date
|
|
2557
|
+
):
|
|
2547
2558
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2548
2559
|
|
|
2549
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2560
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2550
2561
|
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2551
2562
|
df, self.fit_search_keys, self.fit_generated_features
|
|
2552
2563
|
)
|
|
2553
2564
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2565
|
+
if normalizer.removed_features:
|
|
2566
|
+
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
|
2554
2567
|
|
|
2555
2568
|
self.__adjust_cv(df)
|
|
2556
2569
|
|
|
2557
|
-
df = remove_fintech_duplicates(
|
|
2570
|
+
df, fintech_warnings = remove_fintech_duplicates(
|
|
2558
2571
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2559
2572
|
)
|
|
2560
|
-
|
|
2573
|
+
if fintech_warnings:
|
|
2574
|
+
for fintech_warning in fintech_warnings:
|
|
2575
|
+
self.__log_warning(fintech_warning)
|
|
2576
|
+
df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2577
|
+
if full_duplicates_warning:
|
|
2578
|
+
self.__log_warning(full_duplicates_warning)
|
|
2561
2579
|
|
|
2562
2580
|
# Explode multiple search keys
|
|
2563
2581
|
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
@@ -2617,9 +2635,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2617
2635
|
|
|
2618
2636
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2619
2637
|
|
|
2620
|
-
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2621
|
-
df, features_columns, self.generate_features, self.
|
|
2638
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
|
2639
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
|
2622
2640
|
)
|
|
2641
|
+
if feature_validator_warnings:
|
|
2642
|
+
for warning in feature_validator_warnings:
|
|
2643
|
+
self.__log_warning(warning)
|
|
2623
2644
|
self.fit_dropped_features.update(features_to_drop)
|
|
2624
2645
|
df = df.drop(columns=features_to_drop)
|
|
2625
2646
|
|
|
@@ -2735,9 +2756,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2735
2756
|
zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
|
|
2736
2757
|
if zero_hit_columns:
|
|
2737
2758
|
msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2738
|
-
self.
|
|
2739
|
-
self.__display_support_link(msg)
|
|
2740
|
-
self.warning_counter.increment()
|
|
2759
|
+
self.__log_warning(msg, show_support_link=True)
|
|
2741
2760
|
|
|
2742
2761
|
if (
|
|
2743
2762
|
self._search_task.unused_features_for_generation is not None
|
|
@@ -2747,9 +2766,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2747
2766
|
dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
|
|
2748
2767
|
]
|
|
2749
2768
|
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2750
|
-
self.
|
|
2751
|
-
print(msg)
|
|
2752
|
-
self.warning_counter.increment()
|
|
2769
|
+
self.__log_warning(msg)
|
|
2753
2770
|
|
|
2754
2771
|
self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
|
|
2755
2772
|
|
|
@@ -3150,7 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3150
3167
|
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3151
3168
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3152
3169
|
# TODO cast date column to single dtype
|
|
3153
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
3170
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3154
3171
|
converted_X = date_converter.convert(X)
|
|
3155
3172
|
min_date = converted_X[maybe_date_col].min()
|
|
3156
3173
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3192,7 +3209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3192
3209
|
logger.warning(msg)
|
|
3193
3210
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3194
3211
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3195
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE
|
|
3212
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
|
|
3196
3213
|
df = converter.convert(df)
|
|
3197
3214
|
return df
|
|
3198
3215
|
|
|
@@ -3764,15 +3781,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3764
3781
|
if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
|
|
3765
3782
|
msg = self.bundle.get("search_key_country_and_country_code")
|
|
3766
3783
|
self.logger.warning(msg)
|
|
3767
|
-
|
|
3784
|
+
if not silent_mode:
|
|
3785
|
+
self.__log_warning(msg)
|
|
3768
3786
|
self.country_code = None
|
|
3769
3787
|
|
|
3770
3788
|
if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
|
|
3771
3789
|
msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3772
3790
|
self.logger.warning(msg)
|
|
3773
3791
|
if not silent_mode:
|
|
3774
|
-
self.
|
|
3775
|
-
print(msg)
|
|
3792
|
+
self.__log_warning(msg)
|
|
3776
3793
|
|
|
3777
3794
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3778
3795
|
else:
|
|
@@ -3806,27 +3823,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3806
3823
|
and not silent_mode
|
|
3807
3824
|
):
|
|
3808
3825
|
msg = self.bundle.get("date_only_search")
|
|
3809
|
-
|
|
3810
|
-
self.logger.warning(msg)
|
|
3811
|
-
self.warning_counter.increment()
|
|
3826
|
+
self.__log_warning(msg)
|
|
3812
3827
|
|
|
3813
3828
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
3814
3829
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
3815
3830
|
date_column = next(iter(maybe_date))
|
|
3816
3831
|
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
3817
3832
|
msg = self.bundle.get("date_search_without_time_series")
|
|
3818
|
-
|
|
3819
|
-
self.logger.warning(msg)
|
|
3820
|
-
self.warning_counter.increment()
|
|
3833
|
+
self.__log_warning(msg)
|
|
3821
3834
|
|
|
3822
3835
|
if len(valid_search_keys) == 1:
|
|
3823
3836
|
key, value = list(valid_search_keys.items())[0]
|
|
3824
3837
|
# Show warning for country only if country is the only key
|
|
3825
3838
|
if x[key].nunique() == 1:
|
|
3826
3839
|
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
self.warning_counter.increment()
|
|
3840
|
+
if not silent_mode:
|
|
3841
|
+
self.__log_warning(msg)
|
|
3830
3842
|
# TODO maybe raise ValidationError
|
|
3831
3843
|
|
|
3832
3844
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
@@ -3886,9 +3898,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3886
3898
|
)
|
|
3887
3899
|
else:
|
|
3888
3900
|
msg = self.bundle.get("features_info_zero_important_features")
|
|
3889
|
-
self.
|
|
3890
|
-
self.__display_support_link(msg)
|
|
3891
|
-
self.warning_counter.increment()
|
|
3901
|
+
self.__log_warning(msg, show_support_link=True)
|
|
3892
3902
|
except (ImportError, NameError):
|
|
3893
3903
|
print(msg)
|
|
3894
3904
|
print(self._internal_features_info)
|
|
@@ -3990,8 +4000,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3990
4000
|
" But not used because not registered user"
|
|
3991
4001
|
)
|
|
3992
4002
|
if not silent_mode:
|
|
3993
|
-
|
|
3994
|
-
self.warning_counter.increment()
|
|
4003
|
+
self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3995
4004
|
|
|
3996
4005
|
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3997
4006
|
if check_need_detect(SearchKey.PHONE):
|
|
@@ -4010,8 +4019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4010
4019
|
"But not used because not registered user"
|
|
4011
4020
|
)
|
|
4012
4021
|
if not silent_mode:
|
|
4013
|
-
|
|
4014
|
-
self.warning_counter.increment()
|
|
4022
|
+
self.__log_warning(self.bundle.get("phone_detected_not_registered"))
|
|
4015
4023
|
|
|
4016
4024
|
return search_keys
|
|
4017
4025
|
|
|
@@ -4035,19 +4043,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4035
4043
|
part2 = train[half_train:]
|
|
4036
4044
|
train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
4037
4045
|
if train_psi > 0.2:
|
|
4038
|
-
self.
|
|
4039
|
-
msg = self.bundle.get("train_unstable_target").format(train_psi)
|
|
4040
|
-
print(msg)
|
|
4041
|
-
self.logger.warning(msg)
|
|
4046
|
+
self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
|
|
4042
4047
|
|
|
4043
4048
|
# 2. Check train-test PSI
|
|
4044
4049
|
if eval1 is not None:
|
|
4045
4050
|
train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
4046
4051
|
if train_test_psi > 0.2:
|
|
4047
|
-
self.
|
|
4048
|
-
msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
|
|
4049
|
-
print(msg)
|
|
4050
|
-
self.logger.warning(msg)
|
|
4052
|
+
self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
|
|
4051
4053
|
|
|
4052
4054
|
def _dump_python_libs(self):
|
|
4053
4055
|
try:
|
|
@@ -4069,8 +4071,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4069
4071
|
self.logger.warning(f"Showing support link: {link_text}")
|
|
4070
4072
|
display(
|
|
4071
4073
|
HTML(
|
|
4072
|
-
f"""
|
|
4073
|
-
here</a
|
|
4074
|
+
f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
|
|
4075
|
+
here</a><br/>"""
|
|
4074
4076
|
)
|
|
4075
4077
|
)
|
|
4076
4078
|
except (ImportError, NameError):
|
|
@@ -745,20 +745,25 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
745
745
|
|
|
746
746
|
|
|
747
747
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
748
|
-
if
|
|
748
|
+
if scoring is None:
|
|
749
|
+
return
|
|
750
|
+
|
|
751
|
+
if isinstance(scoring, str):
|
|
749
752
|
_get_scorer_by_name(scoring)
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
raise ValidationError(
|
|
754
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
755
|
-
)
|
|
756
|
-
elif scoring is not None:
|
|
753
|
+
return
|
|
754
|
+
|
|
755
|
+
if not isinstance(scoring, Callable):
|
|
757
756
|
raise ValidationError(
|
|
758
757
|
f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
|
|
759
758
|
" that accepts 3 input arguments: estimator, x, y"
|
|
760
759
|
)
|
|
761
760
|
|
|
761
|
+
spec = inspect.getfullargspec(scoring)
|
|
762
|
+
if len(spec.args) < 3:
|
|
763
|
+
raise ValidationError(
|
|
764
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
765
|
+
)
|
|
766
|
+
|
|
762
767
|
|
|
763
768
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
764
769
|
metric_name = scoring
|
|
@@ -26,7 +26,6 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
|
27
27
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
28
28
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
|
29
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
class Normalizer:
|
|
@@ -37,16 +36,13 @@ class Normalizer:
|
|
|
37
36
|
self,
|
|
38
37
|
bundle: ResourceBundle = None,
|
|
39
38
|
logger: Logger = None,
|
|
40
|
-
warnings_counter: WarningCounter = None,
|
|
41
|
-
silent_mode=False,
|
|
42
39
|
):
|
|
43
40
|
self.bundle = bundle or get_custom_bundle()
|
|
44
41
|
self.logger = logger or getLogger()
|
|
45
|
-
self.warnings_counter = warnings_counter or WarningCounter()
|
|
46
|
-
self.silent_mode = silent_mode
|
|
47
42
|
self.columns_renaming = {}
|
|
48
43
|
self.search_keys = {}
|
|
49
44
|
self.generated_features = []
|
|
45
|
+
self.removed_features = []
|
|
50
46
|
|
|
51
47
|
def normalize(
|
|
52
48
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -139,19 +135,11 @@ class Normalizer:
|
|
|
139
135
|
def _remove_dates_from_features(self, df: pd.DataFrame):
|
|
140
136
|
features = self._get_features(df)
|
|
141
137
|
|
|
142
|
-
removed_features = []
|
|
143
138
|
for f in features:
|
|
144
139
|
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
145
|
-
removed_features.append(f)
|
|
140
|
+
self.removed_features.append(f)
|
|
146
141
|
df.drop(columns=f, inplace=True)
|
|
147
142
|
|
|
148
|
-
if removed_features:
|
|
149
|
-
msg = self.bundle.get("dataset_date_features").format(removed_features)
|
|
150
|
-
self.logger.warning(msg)
|
|
151
|
-
if not self.silent_mode:
|
|
152
|
-
print(msg)
|
|
153
|
-
self.warnings_counter.increment()
|
|
154
|
-
|
|
155
143
|
return df
|
|
156
144
|
|
|
157
145
|
def _cut_too_long_string_values(self, df: pd.DataFrame):
|