upgini 1.2.113a5__py3-none-any.whl → 1.2.113a3974.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metadata.py CHANGED
@@ -285,7 +285,6 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
-    psi_value: Optional[float] = None


 class HitRateMetrics(BaseModel):
upgini/metrics.py CHANGED
@@ -1175,10 +1175,7 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-    try:
-        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
-    except TypeError:
-        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
+    _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
     check_consistent_length(y_true, y_pred, sample_weight)

     if (y_true < 0).any():
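Editor's note: the metric guarded above is the standard mean squared log error, computed on log1p-transformed targets. A minimal standalone sketch with invented arrays, independent of upgini's internal `_check_reg_targets` handling:

```python
import numpy as np

def msle(y_true, y_pred):
    # MSLE = mean((log(1 + y_true) - log(1 + y_pred)) ** 2); inputs must be non-negative
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))

print(msle([3, 5, 2.5, 7], [2.5, 5, 4, 8]))  # ~0.0397
```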
upgini/resource_bundle/strings.properties CHANGED
@@ -123,7 +123,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,8 +139,6 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
-oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
-oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check

 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -257,7 +255,6 @@ features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
-features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
upgini/sampler/base.py CHANGED
@@ -1,7 +1,6 @@
 """
 Base class for the under-sampling method.
 """
-
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT

@@ -13,7 +12,6 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
-from sklearn.utils.validation import check_X_y

 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type

@@ -127,7 +125,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
+        X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
         return X, y, binarize_y

     def _more_tags(self):
upgini/sampler/random_under_sampler.py CHANGED
@@ -80,24 +80,14 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE

     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        try:
-            X, y = self._validate_data(
-                X,
-                y,
-                reset=True,
-                accept_sparse=["csr", "csc"],
-                dtype=None,
-                force_all_finite=False,
-            )
-        except AttributeError:
-            from sklearn.utils.validation import check_X_y
-            X, y = check_X_y(
-                X,
-                y,
-                accept_sparse=["csr", "csc"],
-                dtype=None,
-                ensure_all_finite=False,
-            )
+        X, y = self._validate_data(
+            X,
+            y,
+            reset=True,
+            accept_sparse=["csr", "csc"],
+            dtype=None,
+            force_all_finite=False,
+        )
         return X, y, binarize_y

     def _fit_resample(self, X, y):
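Editor's note: the fallback removed here (and the check_X_y call replaced in base.py above) is the usual way to bridge scikit-learn API differences: older releases expose `BaseEstimator._validate_data` with a `force_all_finite` flag, while newer ones drop that method and rename the flag to `ensure_all_finite` in `sklearn.utils.validation.check_X_y`. A generic sketch of such a compatibility shim, not upgini's actual code:

```python
def validate_X_y_compat(estimator, X, y):
    """Validate X/y across scikit-learn versions (illustrative sketch only)."""
    try:
        # Older scikit-learn: estimators still provide _validate_data / force_all_finite
        return estimator._validate_data(
            X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False
        )
    except (AttributeError, TypeError):
        # Newer scikit-learn: use the public helper with the renamed flag
        from sklearn.utils.validation import check_X_y
        return check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None, ensure_all_finite=False)
```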
upgini/utils/deduplicate_utils.py CHANGED
@@ -136,9 +136,6 @@ def remove_fintech_duplicates(
     # Process each eval_set part separately
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
-        # Skip OOT
-        if eval_df[TARGET].isna().all():
-            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -193,49 +190,16 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-
-        # Separate rows to exclude from deduplication:
-        # for each eval_set_index != 0 check separately, all TARGET values are NaN
-        excluded_from_dedup = pd.DataFrame()
-        df_for_dedup = df
-
-        if EVAL_SET_INDEX in df.columns:
-            excluded_parts = []
-            # Get all unique eval_set_index values, except 0
-            unique_eval_indices = df[df[EVAL_SET_INDEX] != 0][EVAL_SET_INDEX].unique()
-
-            for eval_idx in unique_eval_indices:
-                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
-                # Check that all TARGET values for this specific eval_set_index are NaN
-                if len(eval_subset) > 0 and eval_subset[TARGET].isna().all():
-                    excluded_parts.append(eval_subset)
-                    logger.info(
-                        f"Excluded {len(eval_subset)} rows from deduplication "
-                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
-                    )
-
-            # Combine all excluded parts
-            if excluded_parts:
-                excluded_from_dedup = pd.concat(excluded_parts, ignore_index=False)
-                # Remove excluded rows from dataframe for deduplication
-                excluded_indices = excluded_from_dedup.index
-                df_for_dedup = df[~df.index.isin(excluded_indices)]
-        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
+        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
+            dups_indices = df[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup

             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
-        # Combine back excluded rows
-        if len(excluded_from_dedup) > 0:
-            df = pd.concat([df_for_dedup, excluded_from_dedup], ignore_index=False)
-            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
-        else:
-            df = df_for_dedup
+            df = df.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")

     return df, msg
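Editor's note: the simplified branch leans on pandas' `keep=False` semantics, where every member of a duplicate group is flagged, so rows that share the key columns but disagree on the target are all dropped. A tiny sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2, 3], "target": [0, 1, 1, 0]})  # toy data, invented
unique_columns = ["key"]  # key columns without the target

marked = df.duplicated(subset=unique_columns, keep=False)        # both rows with key == 1 are marked
deduped = df.drop_duplicates(subset=unique_columns, keep=False)  # drops every row of each duplicate group
print(int(marked.sum()), len(df) - len(deduped))                 # 2 2 -> two contradictory rows removed
```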
 
upgini/utils/feature_info.py CHANGED
@@ -27,7 +27,6 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
-    psi_value: Optional[float] = None

     @staticmethod
     def from_metadata(
@@ -48,14 +47,12 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
-            psi_value=feature_meta.psi_value,
         )

     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
-            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -67,7 +64,6 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
-            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -80,7 +76,6 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
-            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini-1.2.113a5.dist-info/METADATA → upgini-1.2.113a3974.dev2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.113a5
+Version: 1.2.113a3974.dev2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
upgini-1.2.113a5.dist-info/RECORD → upgini-1.2.113a3974.dev2.dist-info/RECORD RENAMED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=QdA0r4M8wEBY37BMjK9uA_83s1sWkyXy2XJhfn7vl3A,26
+upgini/__about__.py,sha256=ziYMT-cCb1zPGJYidvejUtxXlUCjQLvR25p82kAy21c,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=xFi0a-A3uvtxVwFM6JOyitkEPd1I2slIBj5SWfys3hQ,32724
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=wifdmDP-3e3y51KYhCHPYuN6vU8mj2m3SYo-kMWcNz0,234523
+upgini/features_enricher.py,sha256=rfVdHgUYEq9saqhWcI04jUmNQcAAn5Kto4w3WpxlOpA,221762
 upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
-upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
-upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
+upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
+upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -15,7 +15,7 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
-upgini/autofe/date.py,sha256=Ga022BUSgXJ4W3P8uWkPNo6k6J0IuEZw6Ezs9KNikPk,11188
+upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
 upgini/autofe/feature.py,sha256=b4Ps_sCPui9b4h0K3ya85cfL1SWpLVrlHc40zkKVfAY,16329
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=RB3rKMjFi5Cx81RiYXN3OTCuXjmvzmFKQrxn4h0Oclo,5219
@@ -38,11 +38,11 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=6Q3dwI0v1aiXt7_3Xx0Ih6jMmSCBaaRGIoUiZ5-VnCY,28988
+upgini/resource_bundle/strings.properties,sha256=NyxRwzehkrL5LMoVyjkhN811MvalepavNfjlC9ubE0Q,28677
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
-upgini/sampler/random_under_sampler.py,sha256=4mofmaRTmNwT_HqxecWJyfXdLKK0h9jMBwS46xdrIqE,4356
+upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
+upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
 upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
 upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
 upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
@@ -52,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
-upgini/utils/deduplicate_utils.py,sha256=xXashCSIg87gCy6QyXc0eb8huuzPLANmckMVxUVBEgM,10729
+upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
 upgini/utils/display_utils.py,sha256=Ou7dYdgvvdh443OgOLTM_xKwC2ITx9DQrpKoC2vCRYc,11856
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
+upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -64,7 +64,6 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/psi.py,sha256=pLtECcCeco_WRqMjFnQvhUB4vHArjHtD5HzJFP9ICMc,10972
 upgini/utils/sample_utils.py,sha256=lZJ4yf9Jiq9Em2Ny9m3RIiF7WSxBPrc4E3xxn_8sQk8,15417
 upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -72,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.113a5.dist-info/METADATA,sha256=VOeoK4hhJyhb0OJWG2cgsN-hES6xe3QIRyZMovxP8ek,49531
-upgini-1.2.113a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.113a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.113a5.dist-info/RECORD,,
+upgini-1.2.113a3974.dev2.dist-info/METADATA,sha256=RC2p2RrCBlPWX6hGAcLGtt-k6wOmmq2DFhetxg3LvGk,49539
+upgini-1.2.113a3974.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.113a3974.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.113a3974.dev2.dist-info/RECORD,,
upgini-1.2.113a5.dist-info/WHEEL → upgini-1.2.113a3974.dev2.dist-info/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any
upgini/utils/psi.py DELETED
@@ -1,294 +0,0 @@
-import itertools
-import logging
-import operator
-from functools import reduce
-from typing import Callable, Dict, Optional
-
-import more_itertools
-import numpy as np
-import pandas as pd
-from pandas.api.types import is_numeric_dtype
-from pydantic import BaseModel
-
-from upgini.metadata import TARGET, ModelTaskType
-
-
-class StabilityParams(BaseModel):
-    threshold: float = 999
-    n_intervals: int = 12
-    min_intervals: int = 10
-    max_intervals: Optional[int] = None
-    min_values_in_interval: Optional[int] = None
-    n_bins: int = 10
-    min_values_in_bin: Optional[int] = None
-    cat_top_pct: float = 0.7
-    agg: str = "max"
-
-
-DEFAULT_TARGET_PARAMS = StabilityParams(
-    n_intervals=12,
-    min_intervals=10,
-    max_intervals=None,
-    min_values_in_interval=None,
-    n_bins=5,
-)
-
-DEFAULT_FEATURES_PARAMS = StabilityParams(
-    n_intervals=12,
-    min_intervals=10,
-    max_intervals=None,
-    min_values_in_interval=None,
-    n_bins=10,
-)
-
-
-def calculate_sparsity_psi(
-    df: pd.DataFrame,
-    cat_features: list[str],
-    date_column: str,
-    logger: logging.Logger,
-    model_task_type: ModelTaskType,
-    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
-    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
-) -> Dict[str, float]:
-    sparse_features = df.columns[df.isna().sum() > 0].to_list()
-    if len(sparse_features) > 0:
-        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
-        sparse_df = df[sparse_features].notna()
-        sparse_df[date_column] = df[date_column]
-        return calculate_features_psi(
-            sparse_df,
-            cat_features,
-            date_column,
-            logger,
-            model_task_type,
-            psi_target_params,
-            psi_features_params,
-        )
-    return {}
-
-
-def calculate_features_psi(
-    df: pd.DataFrame,
-    cat_features: list[str],
-    date_column: str,
-    logger: logging.Logger,
-    model_task_type: ModelTaskType,
-    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
-    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
-) -> Dict[str, float]:
-    empty_res = pd.Series(index=df.columns, data=0)
-
-    if not is_numeric_dtype(df[date_column]):
-        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
-
-    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
-
-    if TARGET in df.columns:
-        psi_target_params.n_intervals = min(
-            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
-        )
-        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
-
-        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
-        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
-
-        if psi_target_params.min_values_in_interval is not None and any(
-            len(mask) < psi_target_params.min_values_in_interval
-            for mask in itertools.chain(current_masks, [reference_mask])
-        ):
-            logger.info(
-                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
-            )
-            return empty_res
-
-        target_agg_func = _get_agg_func(psi_target_params.agg)
-        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
-        target_psi = _stability_agg(
-            [df[TARGET][cur] for cur in current_masks],
-            reference_data=df[TARGET][reference_mask],
-            is_numerical=model_task_type == ModelTaskType.REGRESSION,
-            min_values_in_bin=psi_target_params.min_values_in_bin,
-            n_bins=psi_target_params.n_bins,
-            cat_top_pct=psi_target_params.cat_top_pct,
-            agg_func=target_agg_func,
-        )
-        if target_psi is None:
-            logger.info("Cannot determine target PSI. Skip feature PSI check")
-            return pd.Series(index=df.columns, data=0)
-
-        if target_psi > psi_target_params.threshold:
-            logger.info(
-                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
-            )
-            return empty_res
-
-    psi_features_params.n_intervals = min(
-        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
-    )
-    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
-
-    logger.info(f"Calculating PSI for {len(df.columns)} features")
-    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
-    features_agg_func = _get_agg_func(psi_features_params.agg)
-    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
-    psi_values = [
-        _stability_agg(
-            [df[feature][cur] for cur in current_masks],
-            reference_data=df[feature][reference_mask],
-            is_numerical=feature not in cat_features,
-            min_values_in_bin=psi_features_params.min_values_in_bin,
-            n_bins=psi_features_params.n_bins,
-            cat_top_pct=psi_features_params.cat_top_pct,
-            agg_func=features_agg_func,
-        )
-        for feature in df.columns
-        if feature not in [TARGET, date_column]
-    ]
-    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
-
-
-def _split_intervals(
-    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
-) -> tuple[pd.Series, list[pd.Series]]:
-    date_series = df[date_column]
-
-    # Check if we have enough unique values for the requested number of intervals
-    unique_values = date_series.nunique()
-
-    # If we have fewer unique values than requested intervals, adjust n_intervals
-    if unique_values < n_intervals:
-        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
-
-    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
-    interval_labels = time_intervals.unique()
-    reference_mask = time_intervals == interval_labels[0]
-    current_masks = [time_intervals == label for label in interval_labels[1:]]
-    return reference_mask, current_masks
-
-
-def _get_agg_func(agg: str):
-    np_agg = getattr(np, agg, None)
-    if np_agg is None and agg.startswith("q"):
-        q = int(agg[1:])
-        return lambda x: np.quantile(list(x), q / 100, method="higher")
-    return np_agg
-
-
-def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
-    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
-
-
-def _stability_agg(
-    current_data: list[pd.Series],
-    reference_data: pd.Series,
-    is_numerical: bool = True,
-    min_values_in_bin: int | None = None,
-    n_bins: int = 10,
-    cat_top_pct: float = 0.7,
-    agg_func: Callable = max,
-) -> float | None:
-    """Calculate the PSI
-    Args:
-        current_data: current data
-        reference_data: reference data
-        is_numerical: whether the feature is numerical
-        reference_ratio: ratio of current data to use as reference if reference_data is not provided
-        min_values_in_bin: minimum number of values in a bin to calculate PSI
-        n_bins: number of bins to use for numerical features
-    Returns:
-        psi_value: calculated PSI
-    """
-    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
-
-    if len(reference) == 0 or len(current) == 0:
-        return None
-
-    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
-    current = [current[i] for i in nonempty_current]
-    current_data = [current_data[i] for i in nonempty_current]
-
-    if len(current) == 0:
-        return None
-
-    if min_values_in_bin is not None and (
-        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
-    ):
-        return None
-
-    reference = _fill_zeroes(reference / len(reference_data))
-    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
-
-    psi_value = agg_func([_psi(reference, c) for c in current])
-
-    return psi_value
-
-
-def _get_binned_data(
-    reference_data: pd.Series,
-    current_data: list[pd.Series],
-    is_numerical: bool,
-    n_bins: int,
-    cat_top_pct: float,
-):
-    """Split variable into n buckets based on reference quantiles
-    Args:
-        reference_data: reference data
-        current_data: current data
-        feature_type: feature type
-        n: number of quantiles
-    Returns:
-        reference_counts: number of records in each bucket for reference
-        current_counts: number of records in each bucket for current
-    """
-    n_vals = reference_data.nunique()
-
-    if is_numerical and n_vals > 20:
-        bins = _get_bin_edges(reference_data, n_bins)
-        reference_counts = np.histogram(reference_data, bins)[0]
-        current_counts = [np.histogram(d, bins)[0] for d in current_data]
-
-    else:
-        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
-        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
-        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
-        key_dict = more_itertools.map_reduce(
-            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
-            keyfunc=operator.itemgetter(0),
-            valuefunc=operator.itemgetter(1),
-            reducefunc=sum,
-        )
-        key_dict = pd.Series(key_dict)
-        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
-        reference_counts = np.array([ref_feature_dict[key] for key in keys])
-        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
-
-    reference_counts = np.append(reference_counts, reference_data.isna().sum())
-    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
-
-    return reference_counts, current_counts
-
-
-def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
-    eps = 0.0001
-    if (percents == 0).all():
-        np.place(percents, percents == 0, eps)
-    else:
-        min_value = min(percents[percents != 0])
-        if min_value <= eps:
-            np.place(percents, percents == 0, eps)
-        else:
-            np.place(percents, percents == 0, min_value / 10**6)
-    return percents
-
-
-def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
-    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
-    bins[0] = -np.inf
-    bins[-1] = np.inf
-    return bins
-
-
-def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
-    """Get unique values from current and reference series, drop NaNs"""
-    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
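Editor's note: for readers tracing what the removed module computed, the core of `_psi` above is the standard population stability index over binned frequencies. A self-contained numeric sketch (bin counts invented for illustration; the zero-bin guard is a simpler floor than the original `_fill_zeroes`):

```python
import numpy as np

def psi(reference_counts, current_counts, eps=1e-4):
    # Convert bin counts to proportions and floor zero bins to avoid log(0)
    ref = np.asarray(reference_counts, dtype=float)
    cur = np.asarray(current_counts, dtype=float)
    ref = np.clip(ref / ref.sum(), eps, None)
    cur = np.clip(cur / cur.sum(), eps, None)
    # PSI = sum((ref - cur) * ln(ref / cur))
    return float(np.sum((ref - cur) * np.log(ref / cur)))

print(psi([100, 80, 60, 40], [90, 85, 70, 30]))  # small value -> similar distributions
```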