PyPI - upgini - Versions diffs - 1.2.26__tar.gz → 1.2.28__tar.gz - Mend

upgini 1.2.26 (tar.gz) → 1.2.28 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show

{upgini-1.2.26 → upgini-1.2.28}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.26
+Version: 1.2.28
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.28/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.28"

upgini-1.2.28/src/upgini/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from upgini.features_enricher import FeaturesEnricher  # noqa: F401
+from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType  # noqa: F401
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

{upgini-1.2.26 → upgini-1.2.28}/src/upgini/features_enricher.py RENAMED Viewed

@@ -2026,7 +2026,10 @@ class FeaturesEnricher(TransformerMixin):
         start_time = time.time()
         with MDC(trace_id=trace_id):
             self.logger.info("Start transform")
-            self.__log_debug_information(X, exclude_features_sources=exclude_features_sources)
+            validated_X = self._validate_X(X, is_transform=True)
+            self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
             self.__validate_search_keys(self.search_keys, self.search_id)
@@ -2058,8 +2061,6 @@ class FeaturesEnricher(TransformerMixin):
                         self.logger.info(msg)
                         print(msg)
-            validated_X = self._validate_X(X, is_transform=True)
             is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
             columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
@@ -2476,9 +2477,9 @@ class FeaturesEnricher(TransformerMixin):
         validate_scoring_argument(scoring)
         self.__log_debug_information(
-            X,
-            y,
-            eval_set,
+            validated_X,
+            validated_y,
+            validated_eval_set,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             scoring=scoring,
@@ -2546,9 +2547,11 @@ class FeaturesEnricher(TransformerMixin):
             self.fit_generated_features.extend(generator.generated_features)
         # Checks that need validated date
-        if not is_dates_distribution_valid(df, self.fit_search_keys):
-            self.__log_warning(bundle.get("x_unstable_by_date"))
+        try:
+            if not is_dates_distribution_valid(df, self.fit_search_keys):
+                self.__log_warning(bundle.get("x_unstable_by_date"))
+        except Exception:
+            self.logger.exception("Failed to check dates distribution validity")
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
@@ -3760,11 +3763,17 @@ class FeaturesEnricher(TransformerMixin):
         if len(passed_unsupported_search_keys) > 0:
             raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
+        x_columns = [
+            c
+            for c in x.columns
+            if c not in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
+        ]
         for column_id, meaning_type in search_keys.items():
             column_name = None
             if isinstance(column_id, str):
                 if column_id not in x.columns:
-                    raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
+                    raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, x_columns))
                 column_name = column_id
                 valid_search_keys[column_name] = meaning_type
             elif isinstance(column_id, int):
@@ -4038,15 +4047,19 @@ class FeaturesEnricher(TransformerMixin):
         half_train = round(len(train) / 2)
         part1 = train[:half_train]
         part2 = train[half_train:]
-        train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
-        if train_psi > 0.2:
-            self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
+        train_psi_result = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
+        if isinstance(train_psi_result, Exception):
+            self.logger.exception("Failed to calculate train PSI", train_psi_result)
+        elif train_psi_result > 0.2:
+            self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi_result))
         # 2. Check train-test PSI
         if eval1 is not None:
-            train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
-            if train_test_psi > 0.2:
-                self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
+            train_test_psi_result = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
+            if isinstance(train_test_psi_result, Exception):
+                self.logger.exception("Failed to calculate test PSI", train_test_psi_result)
+            elif train_test_psi_result > 0.2:
+                self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi_result))
     def _dump_python_libs(self):
         try:

{upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/strings.properties RENAMED Viewed

@@ -201,7 +201,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
+target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
 binary_target_reason=only two unique label-values observed
 non_numeric_multiclass_reason=non-numeric label values observed
 few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
@@ -212,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
 all_ok_community_invite=❓ Support request
 too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
-imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
+imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

{upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/target_utils.py RENAMED Viewed

@@ -229,25 +229,25 @@ def balance_undersample(
     return resampled_data
-def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
-    df = pd.concat([expected, actual])
+def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
+    try:
+        df = pd.concat([expected, actual])
-    if is_bool_dtype(df):
-        df = np.where(df, 1, 0)
+        if is_bool_dtype(df):
+            df = np.where(df, 1, 0)
-    # Define the bins for the target variable
-    df_min = df.min()
-    df_max = df.max()
-    bins = [df_min, (df_min + df_max) / 2, df_max]
+        # Define the bins for the target variable
+        df_min = df.min()
+        df_max = df.max()
+        bins = [df_min, (df_min + df_max) / 2, df_max]
-    # Calculate the base distribution
-    train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
+        # Calculate the base distribution
+        train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
-    # Calculate the target distribution
-    test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
+        # Calculate the target distribution
+        test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
-    # Calculate the PSI
-    try:
+        # Calculate the PSI
         return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
-    except Exception:
-        return np.nan
+    except Exception as e:
+        return e

upgini-1.2.26/src/upgini/__about__.py DELETED Viewed

@@ -1 +0,0 @@
-__version__ = "1.2.26"

upgini-1.2.26/src/upgini/__init__.py DELETED Viewed

@@ -1,13 +0,0 @@
-import os
-from upgini.features_enricher import FeaturesEnricher  # noqa: F401
-from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType  # noqa: F401
-# from .lazy_import import LazyImport
-os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
-# FeaturesEnricher = LazyImport("upgini.features_enricher", "FeaturesEnricher")
-# SearchKey = LazyImport("upgini.metadata", "SearchKey")
-# RuntimeParameters = LazyImport("upgini.metadata", "RuntimeParameters")
-# CVType = LazyImport("upgini.metadata", "CVType")
-# ModelTaskType = LazyImport("upgini.metadata", "ModelTaskType")