upgini 1.2.22__py3-none-any.whl → 1.2.24__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
Potentially problematic release.
This version of upgini has been flagged as potentially problematic; see the release details below for more information.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +76 -78
- upgini/metrics.py +18 -9
- upgini/normalizer/normalize_utils.py +2 -14
- upgini/resource_bundle/strings.properties +45 -48
- upgini/utils/datetime_utils.py +5 -26
- upgini/utils/deduplicate_utils.py +41 -33
- upgini/utils/features_validator.py +8 -15
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +7 -3
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/METADATA +1 -1
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/RECORD +14 -14
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/WHEEL +0 -0
- {upgini-1.2.22.dist-info → upgini-1.2.24.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.24"
|
upgini/features_enricher.py
CHANGED
|
@@ -77,8 +77,8 @@ from upgini.utils.cv_utils import CVConfig, get_groups
|
|
|
77
77
|
from upgini.utils.datetime_utils import (
|
|
78
78
|
DateTimeSearchKeyConverter,
|
|
79
79
|
is_blocked_time_series,
|
|
80
|
+
is_dates_distribution_valid,
|
|
80
81
|
is_time_series,
|
|
81
|
-
validate_dates_distribution,
|
|
82
82
|
)
|
|
83
83
|
from upgini.utils.deduplicate_utils import (
|
|
84
84
|
clean_full_duplicates,
|
|
@@ -263,7 +263,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
263
263
|
dict()
|
|
264
264
|
)
|
|
265
265
|
|
|
266
|
-
validate_version(self.logger)
|
|
266
|
+
validate_version(self.logger, self.__log_warning)
|
|
267
267
|
self.search_keys = search_keys or {}
|
|
268
268
|
self.country_code = country_code
|
|
269
269
|
self.__validate_search_keys(search_keys, search_id)
|
|
@@ -723,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
723
723
|
|
|
724
724
|
start_time = time.time()
|
|
725
725
|
try:
|
|
726
|
-
result, _ = self.__inner_transform(
|
|
726
|
+
result, _, _ = self.__inner_transform(
|
|
727
727
|
trace_id,
|
|
728
728
|
X,
|
|
729
729
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -951,9 +951,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
951
951
|
gc.collect()
|
|
952
952
|
|
|
953
953
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
954
|
-
|
|
955
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
956
|
-
self.warning_counter.increment()
|
|
954
|
+
self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
|
|
957
955
|
return None
|
|
958
956
|
|
|
959
957
|
print(self.bundle.get("metrics_start"))
|
|
@@ -1654,9 +1652,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1654
1652
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1655
1653
|
generated_features = []
|
|
1656
1654
|
if date_column is not None:
|
|
1657
|
-
converter = DateTimeSearchKeyConverter(
|
|
1658
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1659
|
-
)
|
|
1655
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1660
1656
|
df = converter.convert(df, keep_time=True)
|
|
1661
1657
|
generated_features = converter.generated_features
|
|
1662
1658
|
|
|
@@ -1666,11 +1662,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1666
1662
|
df = generator.generate(df)
|
|
1667
1663
|
generated_features.extend(generator.generated_features)
|
|
1668
1664
|
|
|
1669
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
1665
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
1670
1666
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1671
1667
|
columns_renaming = normalizer.columns_renaming
|
|
1672
1668
|
|
|
1673
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1669
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1674
1670
|
|
|
1675
1671
|
num_samples = _num_samples(df)
|
|
1676
1672
|
sample_threshold, sample_rows = (
|
|
@@ -1817,7 +1813,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1817
1813
|
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1818
1814
|
df = pd.concat([df, eval_df_with_index])
|
|
1819
1815
|
|
|
1820
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1816
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1821
1817
|
|
|
1822
1818
|
# downsample if need to eval_set threshold
|
|
1823
1819
|
num_samples = _num_samples(df)
|
|
@@ -1830,7 +1826,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1830
1826
|
tmp_target_name = "__target"
|
|
1831
1827
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1832
1828
|
|
|
1833
|
-
enriched_df, columns_renaming = self.__inner_transform(
|
|
1829
|
+
enriched_df, columns_renaming, generated_features = self.__inner_transform(
|
|
1834
1830
|
trace_id,
|
|
1835
1831
|
df,
|
|
1836
1832
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1847,7 +1843,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1847
1843
|
|
|
1848
1844
|
x_columns = [
|
|
1849
1845
|
c
|
|
1850
|
-
for c in (validated_X.columns.tolist() +
|
|
1846
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1851
1847
|
if c in enriched_df.columns
|
|
1852
1848
|
]
|
|
1853
1849
|
|
|
@@ -1869,7 +1865,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1869
1865
|
|
|
1870
1866
|
df[TARGET] = validated_y
|
|
1871
1867
|
|
|
1872
|
-
df = clean_full_duplicates(df, logger=self.logger,
|
|
1868
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1873
1869
|
|
|
1874
1870
|
num_samples = _num_samples(df)
|
|
1875
1871
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
@@ -1879,7 +1875,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1875
|
tmp_target_name = "__target"
|
|
1880
1876
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1881
1877
|
|
|
1882
|
-
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1878
|
+
enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
|
|
1883
1879
|
trace_id,
|
|
1884
1880
|
df,
|
|
1885
1881
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1896,7 +1892,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1892
|
|
|
1897
1893
|
x_columns = [
|
|
1898
1894
|
c
|
|
1899
|
-
for c in (validated_X.columns.tolist() +
|
|
1895
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1900
1896
|
if c in enriched_Xy.columns
|
|
1901
1897
|
]
|
|
1902
1898
|
|
|
@@ -1904,7 +1900,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1904
1900
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1905
1901
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1906
1902
|
|
|
1907
|
-
datasets_hash = hash_input(
|
|
1903
|
+
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
1908
1904
|
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1909
1905
|
X_sampled,
|
|
1910
1906
|
y_sampled,
|
|
@@ -2023,7 +2019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2023
2019
|
progress_bar: Optional[ProgressBar] = None,
|
|
2024
2020
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2025
2021
|
add_fit_system_record_id: bool = False,
|
|
2026
|
-
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
2022
|
+
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2027
2023
|
if self._search_task is None:
|
|
2028
2024
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2029
2025
|
|
|
@@ -2036,24 +2032,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2036
2032
|
|
|
2037
2033
|
if len(self.feature_names_) == 0:
|
|
2038
2034
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
2039
|
-
return X, {c: c for c in X.columns}
|
|
2035
|
+
return X, {c: c for c in X.columns}, []
|
|
2040
2036
|
|
|
2041
2037
|
if self._has_paid_features(exclude_features_sources):
|
|
2042
2038
|
msg = self.bundle.get("transform_with_paid_features")
|
|
2043
2039
|
self.logger.warning(msg)
|
|
2044
2040
|
self.__display_support_link(msg)
|
|
2045
|
-
return None, {c: c for c in X.columns}
|
|
2041
|
+
return None, {c: c for c in X.columns}, []
|
|
2046
2042
|
|
|
2047
2043
|
if not metrics_calculation:
|
|
2048
2044
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
2049
2045
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
2050
2046
|
if transform_usage.has_limit:
|
|
2051
2047
|
if len(X) > transform_usage.rest_rows:
|
|
2052
|
-
|
|
2048
|
+
rest_rows = max(transform_usage.rest_rows, 0)
|
|
2049
|
+
msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
|
|
2053
2050
|
self.logger.warning(msg)
|
|
2054
2051
|
print(msg)
|
|
2055
2052
|
show_request_quote_button()
|
|
2056
|
-
return None, {c: c for c in X.columns}
|
|
2053
|
+
return None, {c: c for c in X.columns}, []
|
|
2057
2054
|
else:
|
|
2058
2055
|
msg = self.bundle.get("transform_usage_info").format(
|
|
2059
2056
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -2093,9 +2090,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2093
2090
|
generated_features = []
|
|
2094
2091
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2095
2092
|
if date_column is not None:
|
|
2096
|
-
converter = DateTimeSearchKeyConverter(
|
|
2097
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2098
|
-
)
|
|
2093
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2099
2094
|
df = converter.convert(df)
|
|
2100
2095
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2101
2096
|
generated_features.extend(converter.generated_features)
|
|
@@ -2110,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2110
2105
|
df = generator.generate(df)
|
|
2111
2106
|
generated_features.extend(generator.generated_features)
|
|
2112
2107
|
|
|
2113
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2108
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2114
2109
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2115
2110
|
columns_renaming = normalizer.columns_renaming
|
|
2116
2111
|
|
|
@@ -2176,7 +2171,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2176
2171
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2177
2172
|
df = converter.convert(df)
|
|
2178
2173
|
|
|
2179
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2174
|
+
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2180
2175
|
|
|
2181
2176
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2182
2177
|
for col in features_for_transform:
|
|
@@ -2216,9 +2211,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2216
2211
|
|
|
2217
2212
|
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2218
2213
|
|
|
2219
|
-
df_without_features = clean_full_duplicates(
|
|
2220
|
-
df_without_features, self.logger,
|
|
2214
|
+
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2215
|
+
df_without_features, self.logger, bundle=self.bundle
|
|
2221
2216
|
)
|
|
2217
|
+
if not silent_mode and full_duplicates_warning:
|
|
2218
|
+
self.__log_warning(full_duplicates_warning)
|
|
2222
2219
|
|
|
2223
2220
|
del df
|
|
2224
2221
|
gc.collect()
|
|
@@ -2337,7 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2337
2334
|
if add_fit_system_record_id:
|
|
2338
2335
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2339
2336
|
|
|
2340
|
-
return result, columns_renaming
|
|
2337
|
+
return result, columns_renaming, generated_features
|
|
2341
2338
|
|
|
2342
2339
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2343
2340
|
features_info = self._internal_features_info
|
|
@@ -2415,6 +2412,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2415
2412
|
def __is_registered(self) -> bool:
|
|
2416
2413
|
return self.api_key is not None and self.api_key != ""
|
|
2417
2414
|
|
|
2415
|
+
def __log_warning(self, message: str, show_support_link: bool = False):
|
|
2416
|
+
warning_num = self.warning_counter.increment()
|
|
2417
|
+
formatted_message = f"WARNING #{warning_num}: {message}\n"
|
|
2418
|
+
if show_support_link:
|
|
2419
|
+
self.__display_support_link(formatted_message)
|
|
2420
|
+
else:
|
|
2421
|
+
print(formatted_message)
|
|
2422
|
+
self.logger.warning(message)
|
|
2423
|
+
|
|
2418
2424
|
def __inner_fit(
|
|
2419
2425
|
self,
|
|
2420
2426
|
trace_id: str,
|
|
@@ -2461,9 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2461
2467
|
checked_generate_features = []
|
|
2462
2468
|
for gen_feature in self.generate_features:
|
|
2463
2469
|
if gen_feature not in x_columns:
|
|
2464
|
-
|
|
2465
|
-
print(msg)
|
|
2466
|
-
self.logger.warning(msg)
|
|
2470
|
+
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2467
2471
|
else:
|
|
2468
2472
|
checked_generate_features.append(gen_feature)
|
|
2469
2473
|
self.generate_features = checked_generate_features
|
|
@@ -2524,9 +2528,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2524
2528
|
self.date_format,
|
|
2525
2529
|
self.logger,
|
|
2526
2530
|
bundle=self.bundle,
|
|
2527
|
-
warnings_counter=self.warning_counter,
|
|
2528
2531
|
)
|
|
2529
2532
|
df = converter.convert(df, keep_time=True)
|
|
2533
|
+
if converter.has_old_dates:
|
|
2534
|
+
self.__log_warning(self.bundle.get("dataset_drop_old_dates"))
|
|
2530
2535
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2531
2536
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2532
2537
|
else:
|
|
@@ -2541,7 +2546,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2541
2546
|
self.fit_generated_features.extend(generator.generated_features)
|
|
2542
2547
|
|
|
2543
2548
|
# Checks that need validated date
|
|
2544
|
-
|
|
2549
|
+
|
|
2550
|
+
if not is_dates_distribution_valid(df, self.fit_search_keys):
|
|
2551
|
+
self.__log_warning(bundle.get("x_unstable_by_date"))
|
|
2545
2552
|
|
|
2546
2553
|
if (
|
|
2547
2554
|
is_numeric_dtype(df[self.TARGET_NAME])
|
|
@@ -2550,18 +2557,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2550
2557
|
):
|
|
2551
2558
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2552
2559
|
|
|
2553
|
-
normalizer = Normalizer(self.bundle, self.logger
|
|
2560
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2554
2561
|
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2555
2562
|
df, self.fit_search_keys, self.fit_generated_features
|
|
2556
2563
|
)
|
|
2557
2564
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2565
|
+
if normalizer.removed_features:
|
|
2566
|
+
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
|
2558
2567
|
|
|
2559
2568
|
self.__adjust_cv(df)
|
|
2560
2569
|
|
|
2561
|
-
df = remove_fintech_duplicates(
|
|
2570
|
+
df, fintech_warnings = remove_fintech_duplicates(
|
|
2562
2571
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2563
2572
|
)
|
|
2564
|
-
|
|
2573
|
+
if fintech_warnings:
|
|
2574
|
+
for fintech_warning in fintech_warnings:
|
|
2575
|
+
self.__log_warning(fintech_warning)
|
|
2576
|
+
df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2577
|
+
if full_duplicates_warning:
|
|
2578
|
+
self.__log_warning(full_duplicates_warning)
|
|
2565
2579
|
|
|
2566
2580
|
# Explode multiple search keys
|
|
2567
2581
|
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
@@ -2621,9 +2635,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2621
2635
|
|
|
2622
2636
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2623
2637
|
|
|
2624
|
-
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2625
|
-
df, features_columns, self.generate_features, self.
|
|
2638
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
|
2639
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
|
2626
2640
|
)
|
|
2641
|
+
if feature_validator_warnings:
|
|
2642
|
+
for warning in feature_validator_warnings:
|
|
2643
|
+
self.__log_warning(warning)
|
|
2627
2644
|
self.fit_dropped_features.update(features_to_drop)
|
|
2628
2645
|
df = df.drop(columns=features_to_drop)
|
|
2629
2646
|
|
|
@@ -2739,9 +2756,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2739
2756
|
zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
|
|
2740
2757
|
if zero_hit_columns:
|
|
2741
2758
|
msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2742
|
-
self.
|
|
2743
|
-
self.__display_support_link(msg)
|
|
2744
|
-
self.warning_counter.increment()
|
|
2759
|
+
self.__log_warning(msg, show_support_link=True)
|
|
2745
2760
|
|
|
2746
2761
|
if (
|
|
2747
2762
|
self._search_task.unused_features_for_generation is not None
|
|
@@ -2751,9 +2766,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2751
2766
|
dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
|
|
2752
2767
|
]
|
|
2753
2768
|
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2754
|
-
self.
|
|
2755
|
-
print(msg)
|
|
2756
|
-
self.warning_counter.increment()
|
|
2769
|
+
self.__log_warning(msg)
|
|
2757
2770
|
|
|
2758
2771
|
self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
|
|
2759
2772
|
|
|
@@ -3154,7 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3154
3167
|
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3155
3168
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3156
3169
|
# TODO cast date column to single dtype
|
|
3157
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
3170
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3158
3171
|
converted_X = date_converter.convert(X)
|
|
3159
3172
|
min_date = converted_X[maybe_date_col].min()
|
|
3160
3173
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3196,7 +3209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3196
3209
|
logger.warning(msg)
|
|
3197
3210
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3198
3211
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3199
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE
|
|
3212
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
|
|
3200
3213
|
df = converter.convert(df)
|
|
3201
3214
|
return df
|
|
3202
3215
|
|
|
@@ -3768,15 +3781,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3768
3781
|
if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
|
|
3769
3782
|
msg = self.bundle.get("search_key_country_and_country_code")
|
|
3770
3783
|
self.logger.warning(msg)
|
|
3771
|
-
|
|
3784
|
+
if not silent_mode:
|
|
3785
|
+
self.__log_warning(msg)
|
|
3772
3786
|
self.country_code = None
|
|
3773
3787
|
|
|
3774
3788
|
if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
|
|
3775
3789
|
msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3776
3790
|
self.logger.warning(msg)
|
|
3777
3791
|
if not silent_mode:
|
|
3778
|
-
self.
|
|
3779
|
-
print(msg)
|
|
3792
|
+
self.__log_warning(msg)
|
|
3780
3793
|
|
|
3781
3794
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3782
3795
|
else:
|
|
@@ -3810,27 +3823,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3810
3823
|
and not silent_mode
|
|
3811
3824
|
):
|
|
3812
3825
|
msg = self.bundle.get("date_only_search")
|
|
3813
|
-
|
|
3814
|
-
self.logger.warning(msg)
|
|
3815
|
-
self.warning_counter.increment()
|
|
3826
|
+
self.__log_warning(msg)
|
|
3816
3827
|
|
|
3817
3828
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
3818
3829
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
3819
3830
|
date_column = next(iter(maybe_date))
|
|
3820
3831
|
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
3821
3832
|
msg = self.bundle.get("date_search_without_time_series")
|
|
3822
|
-
|
|
3823
|
-
self.logger.warning(msg)
|
|
3824
|
-
self.warning_counter.increment()
|
|
3833
|
+
self.__log_warning(msg)
|
|
3825
3834
|
|
|
3826
3835
|
if len(valid_search_keys) == 1:
|
|
3827
3836
|
key, value = list(valid_search_keys.items())[0]
|
|
3828
3837
|
# Show warning for country only if country is the only key
|
|
3829
3838
|
if x[key].nunique() == 1:
|
|
3830
3839
|
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3831
|
-
|
|
3832
|
-
|
|
3833
|
-
self.warning_counter.increment()
|
|
3840
|
+
if not silent_mode:
|
|
3841
|
+
self.__log_warning(msg)
|
|
3834
3842
|
# TODO maybe raise ValidationError
|
|
3835
3843
|
|
|
3836
3844
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
@@ -3890,9 +3898,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3890
3898
|
)
|
|
3891
3899
|
else:
|
|
3892
3900
|
msg = self.bundle.get("features_info_zero_important_features")
|
|
3893
|
-
self.
|
|
3894
|
-
self.__display_support_link(msg)
|
|
3895
|
-
self.warning_counter.increment()
|
|
3901
|
+
self.__log_warning(msg, show_support_link=True)
|
|
3896
3902
|
except (ImportError, NameError):
|
|
3897
3903
|
print(msg)
|
|
3898
3904
|
print(self._internal_features_info)
|
|
@@ -3994,8 +4000,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3994
4000
|
" But not used because not registered user"
|
|
3995
4001
|
)
|
|
3996
4002
|
if not silent_mode:
|
|
3997
|
-
|
|
3998
|
-
self.warning_counter.increment()
|
|
4003
|
+
self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3999
4004
|
|
|
4000
4005
|
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
4001
4006
|
if check_need_detect(SearchKey.PHONE):
|
|
@@ -4014,8 +4019,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4014
4019
|
"But not used because not registered user"
|
|
4015
4020
|
)
|
|
4016
4021
|
if not silent_mode:
|
|
4017
|
-
|
|
4018
|
-
self.warning_counter.increment()
|
|
4022
|
+
self.__log_warning(self.bundle.get("phone_detected_not_registered"))
|
|
4019
4023
|
|
|
4020
4024
|
return search_keys
|
|
4021
4025
|
|
|
@@ -4039,19 +4043,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4039
4043
|
part2 = train[half_train:]
|
|
4040
4044
|
train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
4041
4045
|
if train_psi > 0.2:
|
|
4042
|
-
self.
|
|
4043
|
-
msg = self.bundle.get("train_unstable_target").format(train_psi)
|
|
4044
|
-
print(msg)
|
|
4045
|
-
self.logger.warning(msg)
|
|
4046
|
+
self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
|
|
4046
4047
|
|
|
4047
4048
|
# 2. Check train-test PSI
|
|
4048
4049
|
if eval1 is not None:
|
|
4049
4050
|
train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
4050
4051
|
if train_test_psi > 0.2:
|
|
4051
|
-
self.
|
|
4052
|
-
msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
|
|
4053
|
-
print(msg)
|
|
4054
|
-
self.logger.warning(msg)
|
|
4052
|
+
self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
|
|
4055
4053
|
|
|
4056
4054
|
def _dump_python_libs(self):
|
|
4057
4055
|
try:
|
|
@@ -4073,8 +4071,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4073
4071
|
self.logger.warning(f"Showing support link: {link_text}")
|
|
4074
4072
|
display(
|
|
4075
4073
|
HTML(
|
|
4076
|
-
f"""
|
|
4077
|
-
here</a
|
|
4074
|
+
f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
|
|
4075
|
+
here</a><br/>"""
|
|
4078
4076
|
)
|
|
4079
4077
|
)
|
|
4080
4078
|
except (ImportError, NameError):
|
upgini/metrics.py
CHANGED
|
@@ -273,6 +273,9 @@ class EstimatorWrapper:
|
|
|
273
273
|
else:
|
|
274
274
|
x, y = self._remove_empty_target_rows(x, y)
|
|
275
275
|
|
|
276
|
+
# Make order of columns idempotent
|
|
277
|
+
x = x[sorted(x.columns)]
|
|
278
|
+
|
|
276
279
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
277
280
|
return x, y, groups
|
|
278
281
|
|
|
@@ -434,7 +437,8 @@ class EstimatorWrapper:
|
|
|
434
437
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
435
438
|
)
|
|
436
439
|
estimator_copy.set_params(
|
|
437
|
-
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
440
|
+
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
441
|
+
cat_features=cat_features
|
|
438
442
|
)
|
|
439
443
|
estimator = CatBoostWrapper(**kwargs)
|
|
440
444
|
else:
|
|
@@ -745,20 +749,25 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
745
749
|
|
|
746
750
|
|
|
747
751
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
748
|
-
if
|
|
752
|
+
if scoring is None:
|
|
753
|
+
return
|
|
754
|
+
|
|
755
|
+
if isinstance(scoring, str):
|
|
749
756
|
_get_scorer_by_name(scoring)
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
raise ValidationError(
|
|
754
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
755
|
-
)
|
|
756
|
-
elif scoring is not None:
|
|
757
|
+
return
|
|
758
|
+
|
|
759
|
+
if not isinstance(scoring, Callable):
|
|
757
760
|
raise ValidationError(
|
|
758
761
|
f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
|
|
759
762
|
" that accepts 3 input arguments: estimator, x, y"
|
|
760
763
|
)
|
|
761
764
|
|
|
765
|
+
spec = inspect.getfullargspec(scoring)
|
|
766
|
+
if len(spec.args) < 3:
|
|
767
|
+
raise ValidationError(
|
|
768
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
769
|
+
)
|
|
770
|
+
|
|
762
771
|
|
|
763
772
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
764
773
|
metric_name = scoring
|
|
@@ -26,7 +26,6 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
|
27
27
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
28
28
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
|
29
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
class Normalizer:
|
|
@@ -37,16 +36,13 @@ class Normalizer:
|
|
|
37
36
|
self,
|
|
38
37
|
bundle: ResourceBundle = None,
|
|
39
38
|
logger: Logger = None,
|
|
40
|
-
warnings_counter: WarningCounter = None,
|
|
41
|
-
silent_mode=False,
|
|
42
39
|
):
|
|
43
40
|
self.bundle = bundle or get_custom_bundle()
|
|
44
41
|
self.logger = logger or getLogger()
|
|
45
|
-
self.warnings_counter = warnings_counter or WarningCounter()
|
|
46
|
-
self.silent_mode = silent_mode
|
|
47
42
|
self.columns_renaming = {}
|
|
48
43
|
self.search_keys = {}
|
|
49
44
|
self.generated_features = []
|
|
45
|
+
self.removed_features = []
|
|
50
46
|
|
|
51
47
|
def normalize(
|
|
52
48
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -139,19 +135,11 @@ class Normalizer:
|
|
|
139
135
|
def _remove_dates_from_features(self, df: pd.DataFrame):
|
|
140
136
|
features = self._get_features(df)
|
|
141
137
|
|
|
142
|
-
removed_features = []
|
|
143
138
|
for f in features:
|
|
144
139
|
if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
|
|
145
|
-
removed_features.append(f)
|
|
140
|
+
self.removed_features.append(f)
|
|
146
141
|
df.drop(columns=f, inplace=True)
|
|
147
142
|
|
|
148
|
-
if removed_features:
|
|
149
|
-
msg = self.bundle.get("dataset_date_features").format(removed_features)
|
|
150
|
-
self.logger.warning(msg)
|
|
151
|
-
if not self.silent_mode:
|
|
152
|
-
print(msg)
|
|
153
|
-
self.warnings_counter.increment()
|
|
154
|
-
|
|
155
143
|
return df
|
|
156
144
|
|
|
157
145
|
def _cut_too_long_string_values(self, df: pd.DataFrame):
|
|
@@ -15,31 +15,28 @@ transform_usage_warning=You are trying to launch enrichment for {} rows, which w
|
|
|
15
15
|
|
|
16
16
|
# Warnings
|
|
17
17
|
support_link=https://upgini.com/support
|
|
18
|
-
|
|
19
|
-
# slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
|
|
20
|
-
support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
|
|
18
|
+
support_text=Looks like you've run into an error. For help request write us in support
|
|
21
19
|
slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
|
|
22
20
|
slack_community_alt=Upgini Slack community
|
|
23
|
-
version_warning
|
|
24
|
-
unregistered_with_personal_keys
|
|
25
|
-
date_only_search
|
|
26
|
-
date_search_without_time_series
|
|
27
|
-
metrics_exclude_paid_features
|
|
28
|
-
metrics_no_important_free_features
|
|
29
|
-
metrics_no_important_features
|
|
21
|
+
version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
|
|
22
|
+
unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
|
|
23
|
+
date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
|
|
24
|
+
date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
25
|
+
metrics_exclude_paid_features=Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
|
|
26
|
+
metrics_no_important_free_features=No important free features to calculate metrics
|
|
27
|
+
metrics_no_important_features=No important features to calculate metrics
|
|
30
28
|
metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
31
29
|
# metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
32
|
-
# transform_with_trial_features
|
|
30
|
+
# transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
|
|
33
31
|
# Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
34
|
-
metrics_with_paid_features
|
|
35
|
-
transform_with_paid_features
|
|
36
|
-
trial_quota_limit_riched
|
|
37
|
-
loss_selection_warn
|
|
38
|
-
loss_calc_metrics_warn
|
|
39
|
-
multivariate_timeseries_detected
|
|
40
|
-
group_k_fold_in_classification
|
|
41
|
-
current_date_added
|
|
42
|
-
|
|
32
|
+
metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
|
|
33
|
+
transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
|
|
34
|
+
trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
|
|
35
|
+
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
|
+
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
|
+
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
+
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
39
|
+
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
43
40
|
# Errors
|
|
44
41
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
45
42
|
metrics_unfitted_enricher=Call fit method before calling calculate_metrics
|
|
@@ -86,11 +83,11 @@ search_key_not_found=Column `{}` from search_keys was not found in X dataframe:
|
|
|
86
83
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
87
84
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
88
85
|
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearcKey
|
|
89
|
-
search_key_country_and_country_code
|
|
86
|
+
search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
90
87
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
91
|
-
single_constant_search_key
|
|
88
|
+
single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
92
89
|
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
93
|
-
unsupported_index_column
|
|
90
|
+
unsupported_index_column=Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
94
91
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
95
92
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
96
93
|
unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
|
|
@@ -100,7 +97,7 @@ invalid_ip=All values of IP column `{}` are invalid
|
|
|
100
97
|
# X and y validation
|
|
101
98
|
unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
|
|
102
99
|
x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
|
|
103
|
-
x_contains_enriching_columns
|
|
100
|
+
x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
|
|
104
101
|
unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
|
|
105
102
|
y_is_constant=y is a constant. Relevant feature search requires a non-constant y
|
|
106
103
|
x_and_y_diff_size=X and y has different size: {}, {}.
|
|
@@ -113,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
|
|
|
113
110
|
x_is_empty=X is empty
|
|
114
111
|
y_is_empty=y is empty
|
|
115
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
116
|
-
missing_generate_feature
|
|
117
|
-
x_unstable_by_date
|
|
118
|
-
train_unstable_target
|
|
119
|
-
eval_unstable_target
|
|
113
|
+
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
115
|
+
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
116
|
+
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
120
117
|
# eval set validation
|
|
121
118
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
122
119
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -140,21 +137,23 @@ missing_features_for_transform=Missing some features for transform that were pre
|
|
|
140
137
|
# target validation
|
|
141
138
|
empty_target=Target is empty in all rows
|
|
142
139
|
# non_numeric_target=Binary target should be numerical type
|
|
143
|
-
uneven_eval_target_distribution
|
|
144
|
-
target_outliers_warning
|
|
140
|
+
uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
|
|
141
|
+
target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
|
|
142
|
+
|
|
145
143
|
# features validation
|
|
146
|
-
empty_or_contant_features
|
|
147
|
-
high_cardinality_features
|
|
148
|
-
# one_hot_encoded_features
|
|
144
|
+
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
|
145
|
+
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
|
146
|
+
# one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
|
147
|
+
|
|
149
148
|
# Dataset validation
|
|
150
149
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
|
151
150
|
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
|
|
152
151
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
153
|
-
dataset_full_duplicates
|
|
154
|
-
dataset_diff_target_duplicates
|
|
155
|
-
dataset_train_diff_target_duplicates_fintech
|
|
156
|
-
dataset_eval_diff_target_duplicates_fintech
|
|
157
|
-
dataset_drop_old_dates
|
|
152
|
+
dataset_full_duplicates={:.5f}% of the rows are fully duplicated
|
|
153
|
+
dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
154
|
+
dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
155
|
+
dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
156
|
+
dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
158
157
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
159
158
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
160
159
|
dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
|
|
@@ -163,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
|
|
|
163
162
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
164
163
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
165
164
|
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
166
|
-
dataset_rarest_class_less_threshold
|
|
167
|
-
dataset_date_features
|
|
165
|
+
dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
166
|
+
dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
168
167
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
169
168
|
dataset_constant_target=y contains only one distinct value
|
|
170
169
|
dataset_empty_target=y contains only NaN or incorrect values.
|
|
@@ -172,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
|
|
|
172
171
|
dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
|
|
173
172
|
dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
|
|
174
173
|
dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
|
|
175
|
-
binary_small_dataset
|
|
174
|
+
binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
|
|
176
175
|
all_search_keys_invalid=All search keys are invalid
|
|
177
|
-
all_emails_invalid
|
|
178
|
-
# Metrics validation
|
|
176
|
+
all_emails_invalid=All values in column {} are invalid emails # Metrics validation
|
|
179
177
|
metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
|
|
180
178
|
metrics_unsupported_target_type=Unsupported type of target in y: {}
|
|
181
179
|
metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
|
|
@@ -193,8 +191,7 @@ ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample does
|
|
|
193
191
|
# Features info warning
|
|
194
192
|
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
195
193
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
196
|
-
features_not_generated
|
|
197
|
-
|
|
194
|
+
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
198
195
|
# Information
|
|
199
196
|
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
200
197
|
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
@@ -203,8 +200,8 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
|
|
|
203
200
|
email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
phone_detected_not_registered
|
|
207
|
-
target_type_detected
|
|
203
|
+
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
|
+
target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
208
205
|
binary_target_reason=only two unique label-values observed
|
|
209
206
|
non_numeric_multiclass_reason=non-numeric label values observed
|
|
210
207
|
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
@@ -215,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
|
|
|
215
212
|
all_ok_community_invite=❓ Support request
|
|
216
213
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
217
214
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
218
|
-
imbalanced_target
|
|
215
|
+
imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
219
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
220
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
221
218
|
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -11,7 +11,6 @@ from pandas.api.types import is_numeric_dtype
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
13
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
15
14
|
|
|
16
15
|
DATE_FORMATS = [
|
|
17
16
|
"%Y-%m-%d",
|
|
@@ -42,8 +41,6 @@ class DateTimeSearchKeyConverter:
|
|
|
42
41
|
date_format: Optional[str] = None,
|
|
43
42
|
logger: Optional[logging.Logger] = None,
|
|
44
43
|
bundle: Optional[ResourceBundle] = None,
|
|
45
|
-
warnings_counter: Optional[WarningCounter] = None,
|
|
46
|
-
silent_mode=False,
|
|
47
44
|
):
|
|
48
45
|
self.date_column = date_column
|
|
49
46
|
self.date_format = date_format
|
|
@@ -54,8 +51,7 @@ class DateTimeSearchKeyConverter:
|
|
|
54
51
|
self.logger.setLevel("FATAL")
|
|
55
52
|
self.generated_features: List[str] = []
|
|
56
53
|
self.bundle = bundle or get_custom_bundle()
|
|
57
|
-
self.
|
|
58
|
-
self.silent_mode = silent_mode
|
|
54
|
+
self.has_old_dates = False
|
|
59
55
|
|
|
60
56
|
@staticmethod
|
|
61
57
|
def _int_to_opt(i: int) -> Optional[int]:
|
|
@@ -101,7 +97,6 @@ class DateTimeSearchKeyConverter:
|
|
|
101
97
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
|
|
102
98
|
else:
|
|
103
99
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
104
|
-
self.logger.warning(msg)
|
|
105
100
|
raise ValidationError(msg)
|
|
106
101
|
else:
|
|
107
102
|
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
@@ -162,13 +157,9 @@ class DateTimeSearchKeyConverter:
|
|
|
162
157
|
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
163
158
|
old_subset = df[condition]
|
|
164
159
|
if len(old_subset) > 0:
|
|
160
|
+
self.has_old_dates = True
|
|
165
161
|
df.loc[condition, self.date_column] = None
|
|
166
162
|
self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
|
|
167
|
-
msg = self.bundle.get("dataset_drop_old_dates")
|
|
168
|
-
self.logger.warning(msg)
|
|
169
|
-
if not self.silent_mode:
|
|
170
|
-
print(msg)
|
|
171
|
-
self.warnings_counter.increment()
|
|
172
163
|
return df
|
|
173
164
|
|
|
174
165
|
|
|
@@ -256,13 +247,10 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
256
247
|
return len(accumulated_changing_columns) <= 2
|
|
257
248
|
|
|
258
249
|
|
|
259
|
-
def
|
|
250
|
+
def is_dates_distribution_valid(
|
|
260
251
|
df: pd.DataFrame,
|
|
261
252
|
search_keys: Dict[str, SearchKey],
|
|
262
|
-
|
|
263
|
-
bundle: Optional[ResourceBundle] = None,
|
|
264
|
-
warning_counter: Optional[WarningCounter] = None,
|
|
265
|
-
):
|
|
253
|
+
) -> bool:
|
|
266
254
|
maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
267
255
|
|
|
268
256
|
if EVAL_SET_INDEX in df.columns:
|
|
@@ -303,13 +291,4 @@ def validate_dates_distribution(
|
|
|
303
291
|
date_counts_2 = date_counts[round(len(date_counts) / 2) :]
|
|
304
292
|
ratio = date_counts_2.mean() / date_counts_1.mean()
|
|
305
293
|
|
|
306
|
-
|
|
307
|
-
if warning_counter is not None:
|
|
308
|
-
warning_counter.increment()
|
|
309
|
-
if logger is None:
|
|
310
|
-
logger = logging.getLogger("muted_logger")
|
|
311
|
-
logger.setLevel("FATAL")
|
|
312
|
-
bundle = bundle or get_custom_bundle()
|
|
313
|
-
msg = bundle.get("x_unstable_by_date")
|
|
314
|
-
print(msg)
|
|
315
|
-
logger.warning(msg)
|
|
294
|
+
return ratio >= 0.8 and ratio <= 1.2
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from logging import Logger
|
|
2
|
-
from typing import Dict, List, Optional, Union
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
|
|
@@ -12,7 +13,7 @@ from upgini.metadata import (
|
|
|
12
13
|
ModelTaskType,
|
|
13
14
|
SearchKey,
|
|
14
15
|
)
|
|
15
|
-
from upgini.resource_bundle import ResourceBundle
|
|
16
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
16
17
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
18
|
from upgini.utils.target_utils import define_task
|
|
18
19
|
|
|
@@ -22,16 +23,19 @@ def remove_fintech_duplicates(
|
|
|
22
23
|
search_keys: Dict[str, SearchKey],
|
|
23
24
|
date_format: Optional[str] = None,
|
|
24
25
|
logger: Optional[Logger] = None,
|
|
25
|
-
silent=False,
|
|
26
26
|
bundle: ResourceBundle = None,
|
|
27
|
-
) -> pd.DataFrame:
|
|
27
|
+
) -> Tuple[pd.DataFrame, Optional[List[str]]]:
|
|
28
28
|
# Initial checks for target type and date column
|
|
29
|
+
bundle = bundle or get_custom_bundle()
|
|
30
|
+
if logger is None:
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
logger.setLevel(logging.FATAL)
|
|
29
33
|
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
30
34
|
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
31
|
-
return df
|
|
35
|
+
return df, []
|
|
32
36
|
|
|
33
37
|
if date_col is None:
|
|
34
|
-
return df
|
|
38
|
+
return df, []
|
|
35
39
|
|
|
36
40
|
personal_cols = []
|
|
37
41
|
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
@@ -44,7 +48,7 @@ def remove_fintech_duplicates(
|
|
|
44
48
|
if hem_col:
|
|
45
49
|
personal_cols.append(hem_col)
|
|
46
50
|
if len(personal_cols) == 0:
|
|
47
|
-
return df
|
|
51
|
+
return df, []
|
|
48
52
|
|
|
49
53
|
# Splitting into train and eval_set parts
|
|
50
54
|
if EVAL_SET_INDEX in df.columns:
|
|
@@ -54,11 +58,13 @@ def remove_fintech_duplicates(
|
|
|
54
58
|
train_df = df
|
|
55
59
|
eval_dfs = []
|
|
56
60
|
|
|
57
|
-
|
|
61
|
+
warning_messages = []
|
|
62
|
+
|
|
63
|
+
def process_df(segment_df: pd.DataFrame, eval_index=0) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
58
64
|
"""Process a subset of the dataset to remove duplicates based on personal keys."""
|
|
59
65
|
# Fast check for duplicates based on personal keys
|
|
60
66
|
if not segment_df[personal_cols].duplicated().any():
|
|
61
|
-
return segment_df
|
|
67
|
+
return segment_df, None
|
|
62
68
|
|
|
63
69
|
sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
|
|
64
70
|
|
|
@@ -70,18 +76,18 @@ def remove_fintech_duplicates(
|
|
|
70
76
|
total = len(uniques)
|
|
71
77
|
diff_dates = len(uniques[uniques > 1])
|
|
72
78
|
if diff_dates / total >= 0.6:
|
|
73
|
-
return segment_df
|
|
79
|
+
return segment_df, None
|
|
74
80
|
|
|
75
81
|
# Check for duplicate rows
|
|
76
82
|
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
77
83
|
duplicate_rows = sub_df[duplicates]
|
|
78
84
|
if len(duplicate_rows) == 0:
|
|
79
|
-
return segment_df
|
|
85
|
+
return segment_df, None
|
|
80
86
|
|
|
81
87
|
# Check if there are different target values for the same personal keys
|
|
82
88
|
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
83
89
|
if nonunique_target_groups.sum() == 0:
|
|
84
|
-
return segment_df
|
|
90
|
+
return segment_df, None
|
|
85
91
|
|
|
86
92
|
# Helper function to check if there are different target values within 60 days
|
|
87
93
|
def has_diff_target_within_60_days(rows: pd.DataFrame):
|
|
@@ -115,23 +121,23 @@ def remove_fintech_duplicates(
|
|
|
115
121
|
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
116
122
|
perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
|
|
117
123
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if logger:
|
|
121
|
-
logger.warning(msg)
|
|
122
|
-
return segment_df[~segment_df.index.isin(rows_to_remove.index)]
|
|
123
|
-
return segment_df
|
|
124
|
+
return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
|
|
125
|
+
return segment_df, None
|
|
124
126
|
|
|
125
127
|
# Process the train part separately
|
|
126
128
|
logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
|
|
127
|
-
train_df = process_df(train_df)
|
|
129
|
+
train_df, train_warning = process_df(train_df)
|
|
130
|
+
if train_warning:
|
|
131
|
+
warning_messages.append(train_warning)
|
|
128
132
|
logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
|
|
129
133
|
|
|
130
134
|
# Process each eval_set part separately
|
|
131
135
|
new_eval_dfs = []
|
|
132
136
|
for i, eval_df in enumerate(eval_dfs, 1):
|
|
133
137
|
logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
|
|
134
|
-
cleaned_eval_df = process_df(eval_df, i)
|
|
138
|
+
cleaned_eval_df, eval_warning = process_df(eval_df, i)
|
|
139
|
+
if eval_warning:
|
|
140
|
+
warning_messages.append(eval_warning)
|
|
135
141
|
logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
|
|
136
142
|
new_eval_dfs.append(cleaned_eval_df)
|
|
137
143
|
|
|
@@ -143,15 +149,21 @@ def remove_fintech_duplicates(
|
|
|
143
149
|
df = train_df
|
|
144
150
|
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
145
151
|
|
|
146
|
-
return df
|
|
152
|
+
return df, warning_messages
|
|
147
153
|
|
|
148
154
|
|
|
149
155
|
def clean_full_duplicates(
|
|
150
|
-
df: pd.DataFrame, logger: Optional[Logger] = None,
|
|
151
|
-
) -> pd.DataFrame:
|
|
156
|
+
df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
|
|
157
|
+
) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
158
|
+
if logger is None:
|
|
159
|
+
logger = logging.getLogger()
|
|
160
|
+
logger.setLevel(logging.FATAL)
|
|
161
|
+
if bundle is None:
|
|
162
|
+
bundle = get_custom_bundle()
|
|
163
|
+
|
|
152
164
|
nrows = len(df)
|
|
153
165
|
if nrows == 0:
|
|
154
|
-
return df
|
|
166
|
+
return df, None
|
|
155
167
|
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
156
168
|
unique_columns = df.columns.tolist()
|
|
157
169
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
@@ -162,6 +174,7 @@ def clean_full_duplicates(
|
|
|
162
174
|
unique_columns.remove(SORT_ID)
|
|
163
175
|
if EVAL_SET_INDEX in unique_columns:
|
|
164
176
|
unique_columns.remove(EVAL_SET_INDEX)
|
|
177
|
+
|
|
165
178
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
166
179
|
# Train segment goes first so if duplicates are found in train and eval set
|
|
167
180
|
# then we keep unique rows in train segment
|
|
@@ -170,11 +183,9 @@ def clean_full_duplicates(
|
|
|
170
183
|
nrows_after_full_dedup = len(df)
|
|
171
184
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
172
185
|
if share_full_dedup > 0:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
# print(msg)
|
|
177
|
-
# self.warning_counter.increment()
|
|
186
|
+
logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
|
|
187
|
+
|
|
188
|
+
msg = None
|
|
178
189
|
if TARGET in df.columns:
|
|
179
190
|
unique_columns.remove(TARGET)
|
|
180
191
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
@@ -185,13 +196,10 @@ def clean_full_duplicates(
|
|
|
185
196
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
186
197
|
|
|
187
198
|
msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
|
|
188
|
-
logger.warning(msg)
|
|
189
|
-
if not silent:
|
|
190
|
-
print(msg)
|
|
191
199
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
192
200
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
193
201
|
|
|
194
|
-
return df
|
|
202
|
+
return df, msg
|
|
195
203
|
|
|
196
204
|
|
|
197
205
|
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from logging import Logger
|
|
3
|
-
from typing import Dict, List, Optional
|
|
3
|
+
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
7
|
|
|
8
8
|
from upgini.resource_bundle import bundle
|
|
9
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
class FeaturesValidator:
|
|
@@ -21,13 +20,13 @@ class FeaturesValidator:
|
|
|
21
20
|
self,
|
|
22
21
|
df: pd.DataFrame,
|
|
23
22
|
features: List[str],
|
|
24
|
-
features_for_generate: Optional[List[str]],
|
|
25
|
-
warning_counter: WarningCounter,
|
|
23
|
+
features_for_generate: Optional[List[str]] = None,
|
|
26
24
|
columns_renaming: Optional[Dict[str, str]] = None,
|
|
27
|
-
) -> List[str]:
|
|
25
|
+
) -> Tuple[List[str], List[str]]:
|
|
28
26
|
# one_hot_encoded_features = []
|
|
29
27
|
empty_or_constant_features = []
|
|
30
28
|
high_cardinality_features = []
|
|
29
|
+
warnings = []
|
|
31
30
|
|
|
32
31
|
for f in features:
|
|
33
32
|
column = df[f]
|
|
@@ -52,9 +51,7 @@ class FeaturesValidator:
|
|
|
52
51
|
|
|
53
52
|
# if one_hot_encoded_features:
|
|
54
53
|
# msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
|
55
|
-
#
|
|
56
|
-
# self.logger.warning(msg)
|
|
57
|
-
# warning_counter.increment()
|
|
54
|
+
# warnings.append(msg)
|
|
58
55
|
|
|
59
56
|
columns_renaming = columns_renaming or {}
|
|
60
57
|
|
|
@@ -62,9 +59,7 @@ class FeaturesValidator:
|
|
|
62
59
|
msg = bundle.get("empty_or_contant_features").format(
|
|
63
60
|
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
64
61
|
)
|
|
65
|
-
|
|
66
|
-
self.logger.warning(msg)
|
|
67
|
-
warning_counter.increment()
|
|
62
|
+
warnings.append(msg)
|
|
68
63
|
|
|
69
64
|
high_cardinality_features = self.find_high_cardinality(df[features])
|
|
70
65
|
if features_for_generate:
|
|
@@ -75,11 +70,9 @@ class FeaturesValidator:
|
|
|
75
70
|
msg = bundle.get("high_cardinality_features").format(
|
|
76
71
|
[columns_renaming.get(f, f) for f in high_cardinality_features]
|
|
77
72
|
)
|
|
78
|
-
|
|
79
|
-
self.logger.warning(msg)
|
|
80
|
-
warning_counter.increment()
|
|
73
|
+
warnings.append(msg)
|
|
81
74
|
|
|
82
|
-
return empty_or_constant_features + high_cardinality_features
|
|
75
|
+
return (empty_or_constant_features + high_cardinality_features, warnings)
|
|
83
76
|
|
|
84
77
|
@staticmethod
|
|
85
78
|
def find_high_cardinality(df: pd.DataFrame) -> List[str]:
|
upgini/utils/warning_counter.py
CHANGED
upgini/version_validator.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import threading
|
|
3
|
+
from typing import Callable, Optional
|
|
3
4
|
|
|
4
5
|
import requests
|
|
5
6
|
|
|
@@ -30,15 +31,18 @@ def get_version(package, url_pattern=URL_PATTERN):
|
|
|
30
31
|
return version
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def validate_version(logger: logging.Logger):
|
|
34
|
+
def validate_version(logger: logging.Logger, warning_function: Optional[Callable[[str], None]] = None):
|
|
34
35
|
def task():
|
|
35
36
|
try:
|
|
36
37
|
current_version = parse(__version__)
|
|
37
38
|
latest_version = get_version("upgini")
|
|
38
39
|
if current_version < latest_version:
|
|
39
40
|
msg = bundle.get("version_warning").format(current_version, latest_version)
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
if warning_function:
|
|
42
|
+
warning_function(msg)
|
|
43
|
+
else:
|
|
44
|
+
logger.warning(msg)
|
|
45
|
+
print(msg)
|
|
42
46
|
except Exception:
|
|
43
47
|
logger.warning("Failed to validate version", exc_info=True)
|
|
44
48
|
|
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=rRcFnLqwG22zZ399qswskAE5L_if50hEsd_TKzGcrZ4,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=rctS3kRWwTJmU5X203t7sUZ_B40XYVBPeXy_0hPw2Ec,193667
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=PoY1fq6XYAHNzn-rmnwRQZjCoVYP5bJNmKhR0ST2Txk,34588
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
|
-
upgini/version_validator.py,sha256=
|
|
13
|
+
upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
|
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=ikL5KvPcJz9fGyVK-xOvvo6LyRfeOey8xXjoq5nnWqU,26667
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -42,12 +42,12 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=a8X4jX2y3-6E7ZNZIG5z61qfzCvsvaNEjR1Bi5KUqfM,11279
|
|
46
|
+
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
47
47
|
upgini/utils/display_utils.py,sha256=NGhki1aGMsS8OeI69eLXEpmS_s41k8ojKHQxacJaXiU,11493
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
|
-
upgini/utils/features_validator.py,sha256=
|
|
50
|
+
upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
53
53
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -56,8 +56,8 @@ upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,
|
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
|
-
upgini/utils/warning_counter.py,sha256
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
59
|
+
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
60
|
+
upgini-1.2.24.dist-info/METADATA,sha256=eRRiMIY75gP4H4Y20_D9dmut5jCgx_siV-TrG_VA_qg,48578
|
|
61
|
+
upgini-1.2.24.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.24.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.24.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|