PyPI - upgini - Versions diffs - 1.2.117a1__tar.gz → 1.2.118__tar.gz - Mend

upgini 1.2.117a1.tar.gz → 1.2.118.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)

{upgini-1.2.117a1 → upgini-1.2.118}/.gitignore RENAMED Viewed

@@ -111,6 +111,7 @@ env10/
 .env10/
 .env310/
 env11/
+env12/
 venv/
 ENV/
 env.bak/

{upgini-1.2.117a1 → upgini-1.2.118}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.117a1
+Version: 1.2.118
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.118/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.118"

{upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/features_enricher.py RENAMED Viewed

@@ -1423,8 +1423,15 @@ class FeaturesEnricher(TransformerMixin):
         # Find latest eval set or earliest if all eval sets are before train set
         date_column = self._get_date_column(search_keys)
-        # Get minimum date from main dataset X
-        main_min_date = X[date_column].dropna().min()
+        x_date = X[date_column].dropna()
+        if not is_numeric_dtype(x_date):
+            x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
+        main_min_date = x_date.min()
+        for eval_x, _ in eval_set:
+            eval_x_date = eval_x[date_column].dropna()
+            if not is_numeric_dtype(eval_x_date):
+                eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
         # Find minimum date for each eval_set and compare with main dataset
         eval_dates = []
@@ -1433,8 +1440,11 @@ class FeaturesEnricher(TransformerMixin):
                 if len(eval_x) < 1000:
                     self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
                     continue
-                eval_min_date = eval_x[date_column].dropna().min()
-                eval_max_date = eval_x[date_column].dropna().max()
+                eval_x_date = eval_x[date_column].dropna()
+                if not is_numeric_dtype(eval_x_date):
+                    eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
+                eval_min_date = eval_x_date.min()
+                eval_max_date = eval_x_date.max()
                 eval_dates.append((i, eval_min_date, eval_max_date))
         if not eval_dates:
@@ -1460,6 +1470,10 @@ class FeaturesEnricher(TransformerMixin):
         checking_eval_set_df = checking_eval_set_df.copy()
         checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+        if not is_numeric_dtype(checking_eval_set_df[date_column]):
+            checking_eval_set_df[date_column] = (
+                pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+            )
         psi_values_sparse = calculate_sparsity_psi(
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -3708,6 +3722,25 @@ if response.status_code == 200:
             else:
                 raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
+        if any(validated_eval_X.dtypes != X.dtypes):
+            x_types = X.dtypes
+            eval_types = validated_eval_X.dtypes
+            # Find columns with different types
+            diff_cols = [
+                (col, x_types[col], eval_types[col])
+                for col in x_types.index
+                if x_types[col] != eval_types[col]
+            ]
+            diff_col_names = [col for col, _, _ in diff_cols]
+            # print columns with different types
+            print("Columns with different types:")
+            for col, x_type, eval_type in diff_cols:
+                print("-" * 50)
+                print(f"Column: {col}")
+                print(f"X type:        {x_type}")
+                print(f"Eval_set type: {eval_type}")
+            raise ValidationError(self.bundle.get("eval_x_and_x_diff_dtypes").format(diff_col_names))
         if _num_samples(validated_eval_X) != _num_samples(eval_y):
             raise ValidationError(
                 self.bundle.get("x_and_y_diff_size_eval_set").format(
@@ -4420,7 +4453,8 @@ if response.status_code == 200:
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
-            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+            # If all psi values are 0 or null, drop psi column
+            if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
                 self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)

{upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/metrics.py RENAMED Viewed

@@ -847,7 +847,7 @@ class CatBoostWrapper(EstimatorWrapper):
             feature_importance = {}
             for i, col in enumerate(x.columns):
-                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+                feature_importance[col] = float(np.mean(np.abs(shap_values[:, i])))
             return feature_importance
@@ -922,6 +922,7 @@ class LightGBMWrapper(EstimatorWrapper):
                     encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
                 else:
                     encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
+                x_copy = x_copy.drop(columns=self.cat_features, errors="ignore")
                 x_copy[self.cat_features] = encoded
             shap_matrix = estimator.predict(
@@ -943,7 +944,7 @@ class LightGBMWrapper(EstimatorWrapper):
             feature_importance = {}
             for i, col in enumerate(x.columns):
-                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+                feature_importance[col] = float(np.mean(np.abs(shap_matrix[:, i])))
             return feature_importance

{upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/resource_bundle/strings.properties RENAMED Viewed

@@ -123,6 +123,7 @@ unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of
 eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
+eval_x_and_x_diff_dtypes=The column types in eval_set are different from the column types in X: {}
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
 y_is_constant_eval_set=y in eval_set is a constant. Relevant feature search requires a non-constant y
 x_and_y_diff_size_eval_set=X and y in eval_set contain different number of rows: {}, {}

{upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/psi.py RENAMED Viewed

@@ -82,9 +82,6 @@ def calculate_features_psi(
 ) -> dict[str, float]:
     empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
-    if not is_numeric_dtype(df[date_column]):
-        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
     # Filter out rows with missing dates
     df = df[df[date_column].notna()].copy()