upgini 1.2.22__py3-none-any.whl → 1.2.24__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
Potentially problematic release.
This version of upgini has been flagged as potentially problematic; see the release details below for more information.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +76 -78
- upgini/metrics.py +18 -9
- upgini/normalizer/normalize_utils.py +2 -14
- upgini/resource_bundle/strings.properties +45 -48
- upgini/utils/datetime_utils.py +5 -26
- upgini/utils/deduplicate_utils.py +41 -33
- upgini/utils/features_validator.py +8 -15
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +7 -3
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/METADATA +1 -1
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/RECORD +14 -14
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/WHEEL +0 -0
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.24"
|
upgini/features_enricher.py
CHANGED
|
@@ -77,8 +77,8 @@ from upgini.utils.cv_utils import CVConfig, get_groups
|
|
|
77
77
|
from upgini.utils.datetime_utils import (
|
|
78
78
|
DateTimeSearchKeyConverter,
|
|
79
79
|
is_blocked_time_series,
|
|
80
|
+
is_dates_distribution_valid,
|
|
80
81
|
is_time_series,
|
|
81
|
-
validate_dates_distribution,
|
|
82
82
|
)
|
|
83
83
|
from upgini.utils.deduplicate_utils import (
|
|
84
84
|
clean_full_duplicates,
|
|
@@ -263,7 +263,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
263
263
|
dict()
|
|
264
264
|
)
|
|
265
265
|
|
|
266
|
-
validate_version(self.logger)
|
|
266
|
+
validate_version(self.logger, self.__log_warning)
|
|
267
267
|
self.search_keys = search_keys or {}
|
|
268
268
|
self.country_code = country_code
|
|
269
269
|
self.__validate_search_keys(search_keys, search_id)
|
|
@@ -723,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
723
723
|
|
|
724
724
|
start_time = time.time()
|
|
725
725
|
try:
|
|
726
|
-
result, _ = self.__inner_transform(
|
|
726
|
+
result, _, _ = self.__inner_transform(
|
|
727
727
|
trace_id,
|
|
728
728
|
X,
|
|
729
729
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -951,9 +951,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
951
951
|
gc.collect()
|
|
952
952
|
|
|
953
953
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
954
|
-
|
|
955
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
956
|
-
self.warning_counter.increment()
|
|
954
|
+
self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
|
|
957
955
|
return None
|
|
958
956
|
|
|
959
957
|
print(self.bundle.get("metrics_start"))
|
|
@@ -1654,9 +1652,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1654
1652
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1655
1653
|
generated_features = []
|
|
1656
1654
|
if date_column is not None:
|
|
1657
|
-
converter = DateTimeSearchKeyConverter(
|
|
1658
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1659
|
-
)
|
|
1655
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1660
1656
|
df = converter.convert(df, keep_time=True)
|
|
1661
1657
|
generated_features = converter.generated_features
|
|
1662
1658
|
|
|
@@ -1666,11 +1662,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1666
1662
|
df = generator.generate(df)
|
|
1667
1663
|
generated_features.extend(generator.generated_features)
|
|
1668
1664
|
|
|
1669
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
1665
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
1670
1666
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1671
1667
|
columns_renaming = normalizer.columns_renaming
|
|
1672
1668
|
|
|
1673
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1669
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1674
1670
|
|
|
1675
1671
|
num_samples = _num_samples(df)
|
|
1676
1672
|
sample_threshold, sample_rows = (
|
|
@@ -1817,7 +1813,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1817
1813
|
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1818
1814
|
df = pd.concat([df, eval_df_with_index])
|
|
1819
1815
|
|
|
1820
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1816
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1821
1817
|
|
|
1822
1818
|
# downsample if need to eval_set threshold
|
|
1823
1819
|
num_samples = _num_samples(df)
|
|
@@ -1830,7 +1826,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1830
1826
|
tmp_target_name = "__target"
|
|
1831
1827
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1832
1828
|
|
|
1833
|
-
enriched_df, columns_renaming = self.__inner_transform(
|
|
1829
|
+
enriched_df, columns_renaming, generated_features = self.__inner_transform(
|
|
1834
1830
|
trace_id,
|
|
1835
1831
|
df,
|
|
1836
1832
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1847,7 +1843,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1847
1843
|
|
|
1848
1844
|
x_columns = [
|
|
1849
1845
|
c
|
|
1850
|
-
for c in (validated_X.columns.tolist() +
|
|
1846
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1851
1847
|
if c in enriched_df.columns
|
|
1852
1848
|
]
|
|
1853
1849
|
|
|
@@ -1869,7 +1865,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1869
1865
|
|
|
1870
1866
|
df[TARGET] = validated_y
|
|
1871
1867
|
|
|
1872
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1868
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1873
1869
|
|
|
1874
1870
|
num_samples = _num_samples(df)
|
|
1875
1871
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
@@ -1879,7 +1875,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1875
|
tmp_target_name = "__target"
|
|
1880
1876
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1881
1877
|
|
|
1882
|
-
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1878
|
+
enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
|
|
1883
1879
|
trace_id,
|
|
1884
1880
|
df,
|
|
1885
1881
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1896,7 +1892,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1892
|
|
|
1897
1893
|
x_columns = [
|
|
1898
1894
|
c
|
|
1899
|
-
for c in (validated_X.columns.tolist() +
|
|
1895
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1900
1896
|
if c in enriched_Xy.columns
|
|
1901
1897
|
]
|
|
1902
1898
|
|
|
@@ -1904,7 +1900,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1904
1900
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1905
1901
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1906
1902
|
|
|
1907
|
-
datasets_hash = hash_input(
|
|
1903
|
+
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
1908
1904
|
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1909
1905
|
X_sampled,
|
|
1910
1906
|
y_sampled,
|
|
@@ -2023,7 +2019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2023
2019
|
progress_bar: Optional[ProgressBar] = None,
|
|
2024
2020
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2025
2021
|
add_fit_system_record_id: bool = False,
|
|
2026
|
-
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
2022
|
+
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2027
2023
|
if self._search_task is None:
|
|
2028
2024
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2029
2025
|
|
|
@@ -2036,24 +2032,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2036
2032
|
|
|
2037
2033
|
if len(self.feature_names_) == 0:
|
|
2038
2034
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
2039
|
-
return X, {c: c for c in X.columns}
|
|
2035
|
+
return X, {c: c for c in X.columns}, []
|
|
2040
2036
|
|
|
2041
2037
|
if self._has_paid_features(exclude_features_sources):
|
|
2042
2038
|
msg = self.bundle.get("transform_with_paid_features")
|
|
2043
2039
|
self.logger.warning(msg)
|
|
2044
2040
|
self.__display_support_link(msg)
|
|
2045
|
-
return None, {c: c for c in X.columns}
|
|
2041
|
+
return None, {c: c for c in X.columns}, []
|
|
2046
2042
|
|
|
2047
2043
|
if not metrics_calculation:
|
|
2048
2044
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
2049
2045
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
2050
2046
|
if transform_usage.has_limit:
|
|
2051
2047
|
if len(X) > transform_usage.rest_rows:
|
|
2052
|
-
|
|
2048
|
+
rest_rows = max(transform_usage.rest_rows, 0)
|
|
2049
|
+
msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
|
|
2053
2050
|
self.logger.warning(msg)
|
|
2054
2051
|
print(msg)
|
|
2055
2052
|
show_request_quote_button()
|
|
2056
|
-
return None, {c: c for c in X.columns}
|
|
2053
|
+
return None, {c: c for c in X.columns}, []
|
|
2057
2054
|
else:
|
|
2058
2055
|
msg = self.bundle.get("transform_usage_info").format(
|
|
2059
2056
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -2093,9 +2090,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2093
2090
|
generated_features = []
|
|
2094
2091
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2095
2092
|
if date_column is not None:
|
|
2096
|
-
converter = DateTimeSearchKeyConverter(
|
|
2097
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2098
|
-
)
|
|
2093
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2099
2094
|
df = converter.convert(df)
|
|
2100
2095
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2101
2096
|
generated_features.extend(converter.generated_features)
|
|
@@ -2110,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2110
2105
|
df = generator.generate(df)
|
|
2111
2106
|
generated_features.extend(generator.generated_features)
|
|
2112
2107
|
|
|
2113
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2108
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2114
2109
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2115
2110
|
columns_renaming = normalizer.columns_renaming
|
|
2116
2111
|
|
|
@@ -2176,7 +2171,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2176
2171
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2177
2172
|
df = converter.convert(df)
|
|
2178
2173
|
|
|
2179
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2174
|
+
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2180
2175
|
|
|
2181
2176
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2182
2177
|
for col in features_for_transform:
|
|
@@ -2216,9 +2211,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2216
2211
|
|
|
2217
2212
|
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2218
2213
|
|
|
2219
|
-
df_without_features = clean_full_duplicates(
|
|
2220
|
-
df_without_features, self.logger,
|
|
2214
|
+
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2215
|
+
df_without_features, self.logger, bundle=self.bundle
|
|
2221
2216
|
)
|
|
2217
|
+
if not silent_mode and full_duplicates_warning:
|
|
2218
|
+
self.__log_warning(full_duplicates_warning)
|
|
2222
2219
|
|
|
2223
2220
|
del df
|
|
2224
2221
|
gc.collect()
|
|
@@ -2337,7 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2337
2334
|
if add_fit_system_record_id:
|
|
2338
2335
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2339
2336
|
|
|
2340
|
-
return result, columns_renaming
|
|
2337
|
+
return result, columns_renaming, generated_features
|
|
2341
2338
|
|
|
2342
2339
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2343
2340
|
features_info = self._internal_features_info
|
|
@@ -2415,6 +2412,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2415
2412
|
def __is_registered(self) -> bool:
|
|
2416
2413
|
return self.api_key is not None and self.api_key != ""
|
|
2417
2414
|
|
|
2415
|
+
def __log_warning(self, message: str, show_support_link: bool = False):
|
|
2416
|
+
warning_num = self.warning_counter.increment()
|
|
2417
|
+
formatted_message = f"WARNING #{warning_num}: {message}\n"
|
|
2418
|
+
if show_support_link:
|
|
2419
|
+
self.__display_support_link(formatted_message)
|
|
2420
|
+
else:
|
|
2421
|
+
print(formatted_message)
|
|
2422
|
+
self.logger.warning(message)
|
|
2423
|
+
|
|
2418
2424
|
def __inner_fit(
|
|
2419
2425
|
self,
|
|
2420
2426
|
trace_id: str,
|
|
@@ -2461,9 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2461
2467
|
checked_generate_features = []
|
|
2462
2468
|
for gen_feature in self.generate_features:
|
|
2463
2469
|
if gen_feature not in x_columns:
|
|
2464
|
-
|
|
2465
|
-
print(msg)
|
|
2466
|
-
self.logger.warning(msg)
|
|
2470
|
+
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2467
2471
|
else:
|
|
2468
2472
|
checked_generate_features.append(gen_feature)
|
|
2469
2473
|
self.generate_features = checked_generate_features
|
|
@@ -2524,9 +2528,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2524
2528
|
self.date_format,
|
|
2525
2529
|
self.logger,
|
|
2526
2530
|
bundle=self.bundle,
|
|
2527
|
-
warnings_counter=self.warning_counter,
|
|
2528
2531
|
)
|
|
2529
2532
|
df = converter.convert(df, keep_time=True)
|
|
2533
|
+
if converter.has_old_dates:
|
|
2534
|
+
self.__log_warning(self.bundle.get("dataset_drop_old_dates"))
|
|
2530
2535
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2531
2536
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2532
2537
|
else:
|
|
@@ -2541,7 +2546,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2541
2546
|
self.fit_generated_features.extend(generator.generated_features)
|
|
2542
2547
|
|
|
2543
2548
|
# Checks that need validated date
|
|
2544
|
-
|
|
2549
|
+
|
|
2550
|
+
if not is_dates_distribution_valid(df, self.fit_search_keys):
|
|
2551
|
+
self.__log_warning(bundle.get("x_unstable_by_date"))
|
|
2545
2552
|
|
|
2546
2553
|
if (
|
|
2547
2554
|
is_numeric_dtype(df[self.TARGET_NAME])
|
|
@@ -2550,18 +2557,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2550
2557
|
):
|
|
2551
2558
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2552
2559
|
|
|
2553
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2560
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2554
2561
|
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2555
2562
|
df, self.fit_search_keys, self.fit_generated_features
|
|
2556
2563
|
)
|
|
2557
2564
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2565
|
+
if normalizer.removed_features:
|
|
2566
|
+
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
|
2558
2567
|
|
|
2559
2568
|
self.__adjust_cv(df)
|
|
2560
2569
|
|
|
2561
|
-
df = remove_fintech_duplicates(
|
|
2570
|
+
df, fintech_warnings = remove_fintech_duplicates(
|
|
2562
2571
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2563
2572
|
)
|
|
2564
|
-
|
|
2573
|
+
if fintech_warnings:
|
|
2574
|
+
for fintech_warning in fintech_warnings:
|
|
2575
|
+
self.__log_warning(fintech_warning)
|
|
2576
|
+
df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2577
|
+
if full_duplicates_warning:
|
|
2578
|
+
self.__log_warning(full_duplicates_warning)
|
|
2565
2579
|
|
|
2566
2580
|
# Explode multiple search keys
|
|
2567
2581
|
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
@@ -2621,9 +2635,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2621
2635
|
|
|
2622
2636
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2623
2637
|
|
|
2624
|
-
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2625
|
-
df, features_columns, self.generate_features, self.
|
|
2638
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
|
2639
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
|
2626
2640
|
)
|
|
2641
|
+
if feature_validator_warnings:
|
|
2642
|
+
for warning in feature_validator_warnings:
|
|
2643
|
+
self.__log_warning(warning)
|
|
2627
2644
|
self.fit_dropped_features.update(features_to_drop)
|
|
2628
2645
|
df = df.drop(columns=features_to_drop)
|
|
2629
2646
|
|
|
@@ -2739,9 +2756,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2739
2756
|
zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
|
|
2740
2757
|
if zero_hit_columns:
|
|
2741
2758
|
msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2742
|
-
self.
|
|
2743
|
-
self.__display_support_link(msg)
|
|
2744
|
-
self.warning_counter.increment()
|
|
2759
|
+
self.__log_warning(msg, show_support_link=True)
|
|
2745
2760
|
|
|
2746
2761
|
if (
|
|
2747
2762
|
self._search_task.unused_features_for_generation is not None
|
|
@@ -2751,9 +2766,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2751
2766
|
dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
|
|
2752
2767
|
]
|
|
2753
2768
|
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2754
|
-
self.
|
|
2755
|
-
print(msg)
|
|
2756
|
-
self.warning_counter.increment()
|
|
2769
|
+
self.__log_warning(msg)
|
|
2757
2770
|
|
|
2758
2771
|
self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
|
|
2759
2772
|
|
|
@@ -3154,7 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3154
3167
|
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3155
3168
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3156
3169
|
# TODO cast date column to single dtype
|
|
3157
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
3170
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3158
3171
|
converted_X = date_converter.convert(X)
|
|
3159
3172
|
min_date = converted_X[maybe_date_col].min()
|
|
3160
3173
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3196,7 +3209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3196
3209
|
logger.warning(msg)
|
|
3197
3210
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3198
3211
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3199
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE
|
|
3212
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
|
|
3200
3213
|
df = converter.convert(df)
|
|
3201
3214
|
return df
|
|
3202
3215
|
|
|
@@ -3768,15 +3781,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3768
3781
|
if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
|
|
3769
3782
|
msg = self.bundle.get("search_key_country_and_country_code")
|
|
3770
3783
|
self.logger.warning(msg)
|
|
3771
|
-
|
|
3784
|
+
if not silent_mode:
|
|
3785
|
+
self.__log_warning(msg)
|
|
3772
3786
|
self.country_code = None
|
|
3773
3787
|
|
|
3774
3788
|
if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
|
|
3775
3789
|
msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3776
3790
|
self.logger.warning(msg)
|
|
3777
3791
|
if not silent_mode:
|
|
3778
|
-
self.
|
|
3779
|
-
print(msg)
|
|
3792
|
+
self.__log_warning(msg)
|
|
3780
3793
|
|
|
3781
3794
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3782
3795
|
else:
|
|
@@ -3810,27 +3823,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3810
3823
|
and not silent_mode
|
|
3811
3824
|
):
|
|
3812
3825
|
msg = self.bundle.get("date_only_search")
|
|
3813
|
-
|
|
3814
|
-
self.logger.warning(msg)
|
|
3815
|
-
self.warning_counter.increment()
|
|
3826
|
+
self.__log_warning(msg)
|
|
3816
3827
|
|
|
3817
3828
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
3818
3829
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
3819
3830
|
date_column = next(iter(maybe_date))
|
|
3820
3831
|
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
3821
3832
|
msg = self.bundle.get("date_search_without_time_series")
|
|
3822
|
-
|
|
3823
|
-
self.logger.warning(msg)
|
|
3824
|
-
self.warning_counter.increment()
|
|
3833
|
+
self.__log_warning(msg)
|
|
3825
3834
|
|
|
3826
3835
|
if len(valid_search_keys) == 1:
|
|
3827
3836
|
key, value = list(valid_search_keys.items())[0]
|
|
3828
3837
|
# Show warning for country only if country is the only key
|
|
3829
3838
|
if x[key].nunique() == 1:
|
|
3830
3839
|
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3831
|
-
|
|
3832
|
-
|
|
3833
|
-
self.warning_counter.increment()
|
|
3840
|
+
if not silent_mode:
|
|
3841
|
+
self.__log_warning(msg)
|
|
3834
3842
|
# TODO maybe raise ValidationError
|
|
3835
3843
|
|
|
3836
3844
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
@@ -3890,9 +3898,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3890
3898
|
)
|
|
3891
3899
|
else:
|
|
3892
3900
|
msg = self.bundle.get("features_info_zero_important_features")
|
|
3893
|
-
self.
|
|
3894
|
-
self.__display_support_link(msg)
|
|
3895
|
-
self.warning_counter.increment()
|
|
3901
|
+
self.__log_warning(msg, show_support_link=True)
|
|
3896
3902
|
except (ImportError, NameError):
|
|
3897
3903
|
print(msg)
|
|
3898
3904
|
print(self._internal_features_info)
|
|
@@ -3994,8 +4000,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3994
4000
|
" But not used because not registered user"
|
|
3995
4001
|
)
|
|
3996
4002
|
if not silent_mode:
|
|
3997
|
-
|
|
3998
|
-
self.warning_counter.increment()
|
|
4003
|
+
self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3999
4004
|
|
|
4000
4005
|
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
4001
4006
|
if check_need_detect(SearchKey.PHONE):
|
|
@@ -4014,8 +4019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4014
4019
|
"But not used because not registered user"
|
|
4015
4020
|
)
|
|
4016
4021
|
if not silent_mode:
|
|
4017
|
-
|
|
4018
|
-
self.warning_counter.increment()
|
|
4022
|
+
self.__log_warning(self.bundle.get("phone_detected_not_registered"))
|
|
4019
4023
|
|
|
4020
4024
|
return search_keys
|
|
4021
4025
|
|
|
@@ -4039,19 +4043,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4039
4043
|
part2 = train[half_train:]
|
|
4040
4044
|
train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
4041
4045
|
if train_psi > 0.2:
|
|
4042
|
-
self.
|
|
4043
|
-
msg = self.bundle.get("train_unstable_target").format(train_psi)
|
|
4044
|
-
print(msg)
|
|
4045
|
-
self.logger.warning(msg)
|
|
4046
|
+
self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
|
|
4046
4047
|
|
|
4047
4048
|
# 2. Check train-test PSI
|
|
4048
4049
|
if eval1 is not None:
|
|
4049
4050
|
train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
4050
4051
|
if train_test_psi > 0.2:
|
|
4051
|
-
self.
|
|
4052
|
-
msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
|
|
4053
|
-
print(msg)
|
|
4054
|
-
self.logger.warning(msg)
|
|
4052
|
+
self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
|
|
4055
4053
|
|
|
4056
4054
|
def _dump_python_libs(self):
|
|
4057
4055
|
try:
|
|
@@ -4073,8 +4071,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4073
4071
|
self.logger.warning(f"Showing support link: {link_text}")
|
|
4074
4072
|
display(
|
|
4075
4073
|
HTML(
|
|
4076
|
-
f"""
|
|
4077
|
-
here</a
|
|
4074
|
+
f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
|
|
4075
|
+
here</a><br/>"""
|
|
4078
4076
|
)
|
|
4079
4077
|
)
|
|
4080
4078
|
except (ImportError, NameError):
|
upgini/metrics.py
CHANGED
|
@@ -273,6 +273,9 @@ class EstimatorWrapper:
|
|
|
273
273
|
else:
|
|
274
274
|
x, y = self._remove_empty_target_rows(x, y)
|
|
275
275
|
|
|
276
|
+
# Make order of columns idempotent
|
|
277
|
+
x = x[sorted(x.columns)]
|
|
278
|
+
|
|
276
279
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
277
280
|
return x, y, groups
|
|
278
281
|
|
|
@@ -434,7 +437,8 @@ class EstimatorWrapper:
|
|
|
434
437
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
435
438
|
)
|
|
436
439
|
estimator_copy.set_params(
|
|
437
|
-
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
440
|
+
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
441
|
+
cat_features=cat_features
|
|
438
442
|
)
|
|
439
443
|
estimator = CatBoostWrapper(**kwargs)
|
|
440
444
|
else:
|
|
@@ -745,20 +749,25 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
745
749
|
|
|
746
750
|
|
|
747
751
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
748
|
-
if
|
|
752
|
+
if scoring is None:
|
|
753
|
+
return
|
|
754
|
+
|
|
755
|
+
if isinstance(scoring, str):
|
|
749
756
|
_get_scorer_by_name(scoring)
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
raise ValidationError(
|
|
754
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
755
|
-
)
|
|
756
|
-
elif scoring is not None:
|
|
757
|
+
return
|
|
758
|
+
|
|
759
|
+
if not isinstance(scoring, Callable):
|
|
757
760
|
raise ValidationError(
|
|
758
761
|
f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
|
|
759
762
|
" that accepts 3 input arguments: estimator, x, y"
|
|
760
763
|
)
|
|
761
764
|
|
|
765
|
+
spec = inspect.getfullargspec(scoring)
|
|
766
|
+
if len(spec.args) < 3:
|
|
767
|
+
raise ValidationError(
|
|
768
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
769
|
+
)
|
|
770
|
+
|
|
762
771
|
|
|
763
772
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
764
773
|
metric_name = scoring
|
|
@@ -26,7 +26,6 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
|
27
27
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
28
28
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
|
29
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
class Normalizer:
|
|
@@ -37,16 +36,13 @@ class Normalizer:
|
|
|
37
36
|
self,
|
|
38
37
|
bundle: ResourceBundle = None,
|
|
39
38
|
logger: Logger = None,
|
|
40
|
-
warnings_counter: WarningCounter = None,
|
|
41
|
-
silent_mode=False,
|
|
42
39
|
):
|
|
43
40
|
self.bundle = bundle or get_custom_bundle()
|
|
44
41
|
self.logger = logger or getLogger()
|
|
45
|
-
self.warnings_counter = warnings_counter or WarningCounter()
|
|
46
|
-
self.silent_mode = silent_mode
|
|
47
42
|
self.columns_renaming = {}
|
|
48
43
|
self.search_keys = {}
|
|
49
44
|
self.generated_features = []
|
|
45
|
+
self.removed_features = []
|
|
50
46
|
|
|
51
47
|
def normalize(
|
|
52
48
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -139,19 +135,11 @@ class Normalizer:
|
|
|
139
135
|
def _remove_dates_from_features(self, df: pd.DataFrame):
|
|
140
136
|
features = self._get_features(df)
|
|
141
137
|
|
|
142
|
-
removed_features = []
|
|
143
138
|
for f in features:
|
|
144
139
|
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
145
|
-
removed_features.append(f)
|
|
140
|
+
self.removed_features.append(f)
|
|
146
141
|
df.drop(columns=f, inplace=True)
|
|
147
142
|
|
|
148
|
-
if removed_features:
|
|
149
|
-
msg = self.bundle.get("dataset_date_features").format(removed_features)
|
|
150
|
-
self.logger.warning(msg)
|
|
151
|
-
if not self.silent_mode:
|
|
152
|
-
print(msg)
|
|
153
|
-
self.warnings_counter.increment()
|
|
154
|
-
|
|
155
143
|
return df
|
|
156
144
|
|
|
157
145
|
def _cut_too_long_string_values(self, df: pd.DataFrame):
|
|
@@ -15,31 +15,28 @@ transform_usage_warning=You are trying to launch enrichment for {} rows, which w
|
|
|
15
15
|
|
|
16
16
|
# Warnings
|
|
17
17
|
support_link=https://upgini.com/support
|
|
18
|
-
|
|
19
|
-
# slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
|
|
20
|
-
support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
|
|
18
|
+
support_text=Looks like you've run into an error. For help request write us in support
|
|
21
19
|
slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
|
|
22
20
|
slack_community_alt=Upgini Slack community
|
|
23
|
-
version_warning
|
|
24
|
-
unregistered_with_personal_keys
|
|
25
|
-
date_only_search
|
|
26
|
-
date_search_without_time_series
|
|
27
|
-
metrics_exclude_paid_features
|
|
28
|
-
metrics_no_important_free_features
|
|
29
|
-
metrics_no_important_features
|
|
21
|
+
version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
|
|
22
|
+
unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
|
|
23
|
+
date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
|
|
24
|
+
date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
25
|
+
metrics_exclude_paid_features=Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
|
|
26
|
+
metrics_no_important_free_features=No important free features to calculate metrics
|
|
27
|
+
metrics_no_important_features=No important features to calculate metrics
|
|
30
28
|
metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
31
29
|
# metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
32
|
-
# transform_with_trial_features
|
|
30
|
+
# transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
|
|
33
31
|
# Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
34
|
-
metrics_with_paid_features
|
|
35
|
-
transform_with_paid_features
|
|
36
|
-
trial_quota_limit_riched
|
|
37
|
-
loss_selection_warn
|
|
38
|
-
loss_calc_metrics_warn
|
|
39
|
-
multivariate_timeseries_detected
|
|
40
|
-
group_k_fold_in_classification
|
|
41
|
-
current_date_added
|
|
42
|
-
|
|
32
|
+
metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
|
|
33
|
+
transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
|
|
34
|
+
trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
|
|
35
|
+
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
|
+
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
|
+
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
+
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
39
|
+
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
43
40
|
# Errors
|
|
44
41
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
45
42
|
metrics_unfitted_enricher=Call fit method before calling calculate_metrics
|
|
@@ -86,11 +83,11 @@ search_key_not_found=Column `{}` from search_keys was not found in X dataframe:
|
|
|
86
83
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
87
84
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
88
85
|
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearcKey
|
|
89
|
-
search_key_country_and_country_code
|
|
86
|
+
search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
90
87
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
91
|
-
single_constant_search_key
|
|
88
|
+
single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
92
89
|
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
93
|
-
unsupported_index_column
|
|
90
|
+
unsupported_index_column=Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
94
91
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
95
92
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
96
93
|
unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
|
|
@@ -100,7 +97,7 @@ invalid_ip=All values of IP column `{}` are invalid
|
|
|
100
97
|
# X and y validation
|
|
101
98
|
unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
|
|
102
99
|
x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
|
|
103
|
-
x_contains_enriching_columns
|
|
100
|
+
x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
|
|
104
101
|
unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
|
|
105
102
|
y_is_constant=y is a constant. Relevant feature search requires a non-constant y
|
|
106
103
|
x_and_y_diff_size=X and y has different size: {}, {}.
|
|
@@ -113,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
|
|
|
113
110
|
x_is_empty=X is empty
|
|
114
111
|
y_is_empty=y is empty
|
|
115
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
116
|
-
missing_generate_feature
|
|
117
|
-
x_unstable_by_date
|
|
118
|
-
train_unstable_target
|
|
119
|
-
eval_unstable_target
|
|
113
|
+
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
115
|
+
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
116
|
+
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
120
117
|
# eval set validation
|
|
121
118
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
122
119
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -140,21 +137,23 @@ missing_features_for_transform=Missing some features for transform that were pre
|
|
|
140
137
|
# target validation
|
|
141
138
|
empty_target=Target is empty in all rows
|
|
142
139
|
# non_numeric_target=Binary target should be numerical type
|
|
143
|
-
uneven_eval_target_distribution
|
|
144
|
-
target_outliers_warning
|
|
140
|
+
uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
|
|
141
|
+
target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
|
|
142
|
+
|
|
145
143
|
# features validation
|
|
146
|
-
empty_or_contant_features
|
|
147
|
-
high_cardinality_features
|
|
148
|
-
# one_hot_encoded_features
|
|
144
|
+
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
|
145
|
+
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
|
146
|
+
# one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
|
147
|
+
|
|
149
148
|
# Dataset validation
|
|
150
149
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
|
151
150
|
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
|
|
152
151
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
153
|
-
dataset_full_duplicates
|
|
154
|
-
dataset_diff_target_duplicates
|
|
155
|
-
dataset_train_diff_target_duplicates_fintech
|
|
156
|
-
dataset_eval_diff_target_duplicates_fintech
|
|
157
|
-
dataset_drop_old_dates
|
|
152
|
+
dataset_full_duplicates={:.5f}% of the rows are fully duplicated
|
|
153
|
+
dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
154
|
+
dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
155
|
+
dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
156
|
+
dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
158
157
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
159
158
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
160
159
|
dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
|
|
@@ -163,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
|
|
|
163
162
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
164
163
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
165
164
|
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
166
|
-
dataset_rarest_class_less_threshold
|
|
167
|
-
dataset_date_features
|
|
165
|
+
dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
166
|
+
dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
168
167
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
169
168
|
dataset_constant_target=y contains only one distinct value
|
|
170
169
|
dataset_empty_target=y contains only NaN or incorrect values.
|
|
@@ -172,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
|
|
|
172
171
|
dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
|
|
173
172
|
dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
|
|
174
173
|
dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
|
|
175
|
-
binary_small_dataset
|
|
174
|
+
binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
|
|
176
175
|
all_search_keys_invalid=All search keys are invalid
|
|
177
|
-
all_emails_invalid
|
|
178
|
-
# Metrics validation
|
|
176
|
+
all_emails_invalid=All values in column {} are invalid emails # Metrics validation
|
|
179
177
|
metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
|
|
180
178
|
metrics_unsupported_target_type=Unsupported type of target in y: {}
|
|
181
179
|
metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
|
|
@@ -193,8 +191,7 @@ ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample does
|
|
|
193
191
|
# Features info warning
|
|
194
192
|
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
195
193
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
196
|
-
features_not_generated
|
|
197
|
-
|
|
194
|
+
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
198
195
|
# Information
|
|
199
196
|
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
200
197
|
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
@@ -203,8 +200,8 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
|
|
|
203
200
|
email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
phone_detected_not_registered
|
|
207
|
-
target_type_detected
|
|
203
|
+
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
|
+
target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
208
205
|
binary_target_reason=only two unique label-values observed
|
|
209
206
|
non_numeric_multiclass_reason=non-numeric label values observed
|
|
210
207
|
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
@@ -215,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
|
|
|
215
212
|
all_ok_community_invite=❓ Support request
|
|
216
213
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
217
214
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
218
|
-
imbalanced_target
|
|
215
|
+
imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
219
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
220
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
221
218
|
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -11,7 +11,6 @@ from pandas.api.types import is_numeric_dtype
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
13
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
15
14
|
|
|
16
15
|
DATE_FORMATS = [
|
|
17
16
|
"%Y-%m-%d",
|
|
@@ -42,8 +41,6 @@ class DateTimeSearchKeyConverter:
|
|
|
42
41
|
date_format: Optional[str] = None,
|
|
43
42
|
logger: Optional[logging.Logger] = None,
|
|
44
43
|
bundle: Optional[ResourceBundle] = None,
|
|
45
|
-
warnings_counter: Optional[WarningCounter] = None,
|
|
46
|
-
silent_mode=False,
|
|
47
44
|
):
|
|
48
45
|
self.date_column = date_column
|
|
49
46
|
self.date_format = date_format
|
|
@@ -54,8 +51,7 @@ class DateTimeSearchKeyConverter:
|
|
|
54
51
|
self.logger.setLevel("FATAL")
|
|
55
52
|
self.generated_features: List[str] = []
|
|
56
53
|
self.bundle = bundle or get_custom_bundle()
|
|
57
|
-
self.
|
|
58
|
-
self.silent_mode = silent_mode
|
|
54
|
+
self.has_old_dates = False
|
|
59
55
|
|
|
60
56
|
@staticmethod
|
|
61
57
|
def _int_to_opt(i: int) -> Optional[int]:
|
|
@@ -101,7 +97,6 @@ class DateTimeSearchKeyConverter:
|
|
|
101
97
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
|
|
102
98
|
else:
|
|
103
99
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
104
|
-
self.logger.warning(msg)
|
|
105
100
|
raise ValidationError(msg)
|
|
106
101
|
else:
|
|
107
102
|
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
@@ -162,13 +157,9 @@ class DateTimeSearchKeyConverter:
|
|
|
162
157
|
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
163
158
|
old_subset = df[condition]
|
|
164
159
|
if len(old_subset) > 0:
|
|
160
|
+
self.has_old_dates = True
|
|
165
161
|
df.loc[condition, self.date_column] = None
|
|
166
162
|
self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
|
|
167
|
-
msg = self.bundle.get("dataset_drop_old_dates")
|
|
168
|
-
self.logger.warning(msg)
|
|
169
|
-
if not self.silent_mode:
|
|
170
|
-
print(msg)
|
|
171
|
-
self.warnings_counter.increment()
|
|
172
163
|
return df
|
|
173
164
|
|
|
174
165
|
|
|
@@ -256,13 +247,10 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
256
247
|
return len(accumulated_changing_columns) <= 2
|
|
257
248
|
|
|
258
249
|
|
|
259
|
-
def
|
|
250
|
+
def is_dates_distribution_valid(
|
|
260
251
|
df: pd.DataFrame,
|
|
261
252
|
search_keys: Dict[str, SearchKey],
|
|
262
|
-
|
|
263
|
-
bundle: Optional[ResourceBundle] = None,
|
|
264
|
-
warning_counter: Optional[WarningCounter] = None,
|
|
265
|
-
):
|
|
253
|
+
) -> bool:
|
|
266
254
|
maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
267
255
|
|
|
268
256
|
if EVAL_SET_INDEX in df.columns:
|
|
@@ -303,13 +291,4 @@ def validate_dates_distribution(
|
|
|
303
291
|
date_counts_2 = date_counts[round(len(date_counts) / 2) :]
|
|
304
292
|
ratio = date_counts_2.mean() / date_counts_1.mean()
|
|
305
293
|
|
|
306
|
-
|
|
307
|
-
if warning_counter is not None:
|
|
308
|
-
warning_counter.increment()
|
|
309
|
-
if logger is None:
|
|
310
|
-
logger = logging.getLogger("muted_logger")
|
|
311
|
-
logger.setLevel("FATAL")
|
|
312
|
-
bundle = bundle or get_custom_bundle()
|
|
313
|
-
msg = bundle.get("x_unstable_by_date")
|
|
314
|
-
print(msg)
|
|
315
|
-
logger.warning(msg)
|
|
294
|
+
return ratio >= 0.8 and ratio <= 1.2
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from logging import Logger
|
|
2
|
-
from typing import Dict, List, Optional, Union
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
|
|
@@ -12,7 +13,7 @@ from upgini.metadata import (
|
|
|
12
13
|
ModelTaskType,
|
|
13
14
|
SearchKey,
|
|
14
15
|
)
|
|
15
|
-
from upgini.resource_bundle import ResourceBundle
|
|
16
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
16
17
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
18
|
from upgini.utils.target_utils import define_task
|
|
18
19
|
|
|
@@ -22,16 +23,19 @@ def remove_fintech_duplicates(
|
|
|
22
23
|
search_keys: Dict[str, SearchKey],
|
|
23
24
|
date_format: Optional[str] = None,
|
|
24
25
|
logger: Optional[Logger] = None,
|
|
25
|
-
silent=False,
|
|
26
26
|
bundle: ResourceBundle = None,
|
|
27
|
-
) -> pd.DataFrame:
|
|
27
|
+
) -> Tuple[pd.DataFrame, Optional[List[str]]]:
|
|
28
28
|
# Initial checks for target type and date column
|
|
29
|
+
bundle = bundle or get_custom_bundle()
|
|
30
|
+
if logger is None:
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
logger.setLevel(logging.FATAL)
|
|
29
33
|
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
30
34
|
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
31
|
-
return df
|
|
35
|
+
return df, []
|
|
32
36
|
|
|
33
37
|
if date_col is None:
|
|
34
|
-
return df
|
|
38
|
+
return df, []
|
|
35
39
|
|
|
36
40
|
personal_cols = []
|
|
37
41
|
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
@@ -44,7 +48,7 @@ def remove_fintech_duplicates(
|
|
|
44
48
|
if hem_col:
|
|
45
49
|
personal_cols.append(hem_col)
|
|
46
50
|
if len(personal_cols) == 0:
|
|
47
|
-
return df
|
|
51
|
+
return df, []
|
|
48
52
|
|
|
49
53
|
# Splitting into train and eval_set parts
|
|
50
54
|
if EVAL_SET_INDEX in df.columns:
|
|
@@ -54,11 +58,13 @@ def remove_fintech_duplicates(
|
|
|
54
58
|
train_df = df
|
|
55
59
|
eval_dfs = []
|
|
56
60
|
|
|
57
|
-
|
|
61
|
+
warning_messages = []
|
|
62
|
+
|
|
63
|
+
def process_df(segment_df: pd.DataFrame, eval_index=0) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
58
64
|
"""Process a subset of the dataset to remove duplicates based on personal keys."""
|
|
59
65
|
# Fast check for duplicates based on personal keys
|
|
60
66
|
if not segment_df[personal_cols].duplicated().any():
|
|
61
|
-
return segment_df
|
|
67
|
+
return segment_df, None
|
|
62
68
|
|
|
63
69
|
sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
|
|
64
70
|
|
|
@@ -70,18 +76,18 @@ def remove_fintech_duplicates(
|
|
|
70
76
|
total = len(uniques)
|
|
71
77
|
diff_dates = len(uniques[uniques > 1])
|
|
72
78
|
if diff_dates / total >= 0.6:
|
|
73
|
-
return segment_df
|
|
79
|
+
return segment_df, None
|
|
74
80
|
|
|
75
81
|
# Check for duplicate rows
|
|
76
82
|
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
77
83
|
duplicate_rows = sub_df[duplicates]
|
|
78
84
|
if len(duplicate_rows) == 0:
|
|
79
|
-
return segment_df
|
|
85
|
+
return segment_df, None
|
|
80
86
|
|
|
81
87
|
# Check if there are different target values for the same personal keys
|
|
82
88
|
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
83
89
|
if nonunique_target_groups.sum() == 0:
|
|
84
|
-
return segment_df
|
|
90
|
+
return segment_df, None
|
|
85
91
|
|
|
86
92
|
# Helper function to check if there are different target values within 60 days
|
|
87
93
|
def has_diff_target_within_60_days(rows: pd.DataFrame):
|
|
@@ -115,23 +121,23 @@ def remove_fintech_duplicates(
|
|
|
115
121
|
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
116
122
|
perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
|
|
117
123
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if logger:
|
|
121
|
-
logger.warning(msg)
|
|
122
|
-
return segment_df[~segment_df.index.isin(rows_to_remove.index)]
|
|
123
|
-
return segment_df
|
|
124
|
+
return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
|
|
125
|
+
return segment_df, None
|
|
124
126
|
|
|
125
127
|
# Process the train part separately
|
|
126
128
|
logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
|
|
127
|
-
train_df = process_df(train_df)
|
|
129
|
+
train_df, train_warning = process_df(train_df)
|
|
130
|
+
if train_warning:
|
|
131
|
+
warning_messages.append(train_warning)
|
|
128
132
|
logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
|
|
129
133
|
|
|
130
134
|
# Process each eval_set part separately
|
|
131
135
|
new_eval_dfs = []
|
|
132
136
|
for i, eval_df in enumerate(eval_dfs, 1):
|
|
133
137
|
logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
|
|
134
|
-
cleaned_eval_df = process_df(eval_df, i)
|
|
138
|
+
cleaned_eval_df, eval_warning = process_df(eval_df, i)
|
|
139
|
+
if eval_warning:
|
|
140
|
+
warning_messages.append(eval_warning)
|
|
135
141
|
logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
|
|
136
142
|
new_eval_dfs.append(cleaned_eval_df)
|
|
137
143
|
|
|
@@ -143,15 +149,21 @@ def remove_fintech_duplicates(
|
|
|
143
149
|
df = train_df
|
|
144
150
|
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
145
151
|
|
|
146
|
-
return df
|
|
152
|
+
return df, warning_messages
|
|
147
153
|
|
|
148
154
|
|
|
149
155
|
def clean_full_duplicates(
|
|
150
|
-
df: pd.DataFrame, logger: Optional[Logger] = None,
|
|
151
|
-
) -> pd.DataFrame:
|
|
156
|
+
df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
|
|
157
|
+
) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
158
|
+
if logger is None:
|
|
159
|
+
logger = logging.getLogger()
|
|
160
|
+
logger.setLevel(logging.FATAL)
|
|
161
|
+
if bundle is None:
|
|
162
|
+
bundle = get_custom_bundle()
|
|
163
|
+
|
|
152
164
|
nrows = len(df)
|
|
153
165
|
if nrows == 0:
|
|
154
|
-
return df
|
|
166
|
+
return df, None
|
|
155
167
|
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
156
168
|
unique_columns = df.columns.tolist()
|
|
157
169
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
@@ -162,6 +174,7 @@ def clean_full_duplicates(
|
|
|
162
174
|
unique_columns.remove(SORT_ID)
|
|
163
175
|
if EVAL_SET_INDEX in unique_columns:
|
|
164
176
|
unique_columns.remove(EVAL_SET_INDEX)
|
|
177
|
+
|
|
165
178
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
166
179
|
# Train segment goes first so if duplicates are found in train and eval set
|
|
167
180
|
# then we keep unique rows in train segment
|
|
@@ -170,11 +183,9 @@ def clean_full_duplicates(
|
|
|
170
183
|
nrows_after_full_dedup = len(df)
|
|
171
184
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
172
185
|
if share_full_dedup > 0:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
# print(msg)
|
|
177
|
-
# self.warning_counter.increment()
|
|
186
|
+
logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
|
|
187
|
+
|
|
188
|
+
msg = None
|
|
178
189
|
if TARGET in df.columns:
|
|
179
190
|
unique_columns.remove(TARGET)
|
|
180
191
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
@@ -185,13 +196,10 @@ def clean_full_duplicates(
|
|
|
185
196
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
186
197
|
|
|
187
198
|
msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
|
|
188
|
-
logger.warning(msg)
|
|
189
|
-
if not silent:
|
|
190
|
-
print(msg)
|
|
191
199
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
192
200
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
193
201
|
|
|
194
|
-
return df
|
|
202
|
+
return df, msg
|
|
195
203
|
|
|
196
204
|
|
|
197
205
|
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from logging import Logger
|
|
3
|
-
from typing import Dict, List, Optional
|
|
3
|
+
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
7
|
|
|
8
8
|
from upgini.resource_bundle import bundle
|
|
9
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
class FeaturesValidator:
|
|
@@ -21,13 +20,13 @@ class FeaturesValidator:
|
|
|
21
20
|
self,
|
|
22
21
|
df: pd.DataFrame,
|
|
23
22
|
features: List[str],
|
|
24
|
-
features_for_generate: Optional[List[str]],
|
|
25
|
-
warning_counter: WarningCounter,
|
|
23
|
+
features_for_generate: Optional[List[str]] = None,
|
|
26
24
|
columns_renaming: Optional[Dict[str, str]] = None,
|
|
27
|
-
) -> List[str]:
|
|
25
|
+
) -> Tuple[List[str], List[str]]:
|
|
28
26
|
# one_hot_encoded_features = []
|
|
29
27
|
empty_or_constant_features = []
|
|
30
28
|
high_cardinality_features = []
|
|
29
|
+
warnings = []
|
|
31
30
|
|
|
32
31
|
for f in features:
|
|
33
32
|
column = df[f]
|
|
@@ -52,9 +51,7 @@ class FeaturesValidator:
|
|
|
52
51
|
|
|
53
52
|
# if one_hot_encoded_features:
|
|
54
53
|
# msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
|
55
|
-
#
|
|
56
|
-
# self.logger.warning(msg)
|
|
57
|
-
# warning_counter.increment()
|
|
54
|
+
# warnings.append(msg)
|
|
58
55
|
|
|
59
56
|
columns_renaming = columns_renaming or {}
|
|
60
57
|
|
|
@@ -62,9 +59,7 @@ class FeaturesValidator:
|
|
|
62
59
|
msg = bundle.get("empty_or_contant_features").format(
|
|
63
60
|
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
64
61
|
)
|
|
65
|
-
|
|
66
|
-
self.logger.warning(msg)
|
|
67
|
-
warning_counter.increment()
|
|
62
|
+
warnings.append(msg)
|
|
68
63
|
|
|
69
64
|
high_cardinality_features = self.find_high_cardinality(df[features])
|
|
70
65
|
if features_for_generate:
|
|
@@ -75,11 +70,9 @@ class FeaturesValidator:
|
|
|
75
70
|
msg = bundle.get("high_cardinality_features").format(
|
|
76
71
|
[columns_renaming.get(f, f) for f in high_cardinality_features]
|
|
77
72
|
)
|
|
78
|
-
|
|
79
|
-
self.logger.warning(msg)
|
|
80
|
-
warning_counter.increment()
|
|
73
|
+
warnings.append(msg)
|
|
81
74
|
|
|
82
|
-
return empty_or_constant_features + high_cardinality_features
|
|
75
|
+
return (empty_or_constant_features + high_cardinality_features, warnings)
|
|
83
76
|
|
|
84
77
|
@staticmethod
|
|
85
78
|
def find_high_cardinality(df: pd.DataFrame) -> List[str]:
|
upgini/utils/warning_counter.py
CHANGED
upgini/version_validator.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import threading
|
|
3
|
+
from typing import Callable, Optional
|
|
3
4
|
|
|
4
5
|
import requests
|
|
5
6
|
|
|
@@ -30,15 +31,18 @@ def get_version(package, url_pattern=URL_PATTERN):
|
|
|
30
31
|
return version
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def validate_version(logger: logging.Logger):
|
|
34
|
+
def validate_version(logger: logging.Logger, warning_function: Optional[Callable[[str], None]] = None):
|
|
34
35
|
def task():
|
|
35
36
|
try:
|
|
36
37
|
current_version = parse(__version__)
|
|
37
38
|
latest_version = get_version("upgini")
|
|
38
39
|
if current_version < latest_version:
|
|
39
40
|
msg = bundle.get("version_warning").format(current_version, latest_version)
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
if warning_function:
|
|
42
|
+
warning_function(msg)
|
|
43
|
+
else:
|
|
44
|
+
logger.warning(msg)
|
|
45
|
+
print(msg)
|
|
42
46
|
except Exception:
|
|
43
47
|
logger.warning("Failed to validate version", exc_info=True)
|
|
44
48
|
|
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=rRcFnLqwG22zZ399qswskAE5L_if50hEsd_TKzGcrZ4,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=rctS3kRWwTJmU5X203t7sUZ_B40XYVBPeXy_0hPw2Ec,193667
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=PoY1fq6XYAHNzn-rmnwRQZjCoVYP5bJNmKhR0ST2Txk,34588
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
|
-
upgini/version_validator.py,sha256=
|
|
13
|
+
upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
|
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=ikL5KvPcJz9fGyVK-xOvvo6LyRfeOey8xXjoq5nnWqU,26667
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -42,12 +42,12 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=a8X4jX2y3-6E7ZNZIG5z61qfzCvsvaNEjR1Bi5KUqfM,11279
|
|
46
|
+
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
47
47
|
upgini/utils/display_utils.py,sha256=NGhki1aGMsS8OeI69eLXEpmS_s41k8ojKHQxacJaXiU,11493
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
|
-
upgini/utils/features_validator.py,sha256=
|
|
50
|
+
upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
53
53
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -56,8 +56,8 @@ upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,
|
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
|
-
upgini/utils/warning_counter.py,sha256
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
59
|
+
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
60
|
+
upgini-1.2.24.dist-info/METADATA,sha256=eRRiMIY75gP4H4Y20_D9dmut5jCgx_siV-TrG_VA_qg,48578
|
|
61
|
+
upgini-1.2.24.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.24.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.24.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|