PyPI - upgini - Versions diffs - 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl - Mend

upgini 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

upgini/autofe/date.py CHANGED Viewed

@@ -54,6 +54,9 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
         return diff
+_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
 class DateListDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
@@ -72,18 +75,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
+        right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
-        return pd.Series(left.index.map(lambda i: self.reduce(self.map_diff(left.loc[i], right.loc[i]))))
+    def _diff(self, x):
+        x = x / np.timedelta64(1, self.diff_unit)
+        return x[x > 0]
+    def _agg(self, x):
+        method = getattr(np, self.aggregation, None)
+        default = np.nan
+        if method is None and self.aggregation in _ext_aggregations:
+            method, default = _ext_aggregations[self.aggregation]
+        elif not callable(method):
+            raise ValueError(f"Unsupported aggregation: {self.aggregation}")
+        return method(x) if len(x) > 0 else default
 class DateListDiffBounded(DateListDiff):
     lower_bound: Optional[int]
     upper_bound: Optional[int]
-    inclusive: Optional[str]
     def __init__(self, **data: Any) -> None:
         if "name" not in data:
-            inclusive = data.get("inclusive")
             lower_bound = data.get("lower_bound")
             upper_bound = data.get("upper_bound")
             components = [
@@ -92,18 +108,10 @@ class DateListDiffBounded(DateListDiff):
                 str(lower_bound if lower_bound is not None else "minusinf"),
                 str(upper_bound if upper_bound is not None else "plusinf"),
             ]
-            if inclusive:
-                components.append(inclusive)
             components.append(data.get("aggregation"))
             data["name"] = "_".join(components)
         super().__init__(**data)
-    def reduce(self, diff_list: pd.Series) -> float:
-        return diff_list[
-            (diff_list > 0)
-            & (
-                diff_list.between(
-                    self.lower_bound or -np.inf, self.upper_bound or np.inf, inclusive=self.inclusive or "left"
-                )
-            )
-        ].aggregate(self.aggregation)
+    def _agg(self, x):
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        return super()._agg(x)

upgini/features_enricher.py CHANGED Viewed

@@ -94,7 +94,7 @@ try:
 except Exception:
     from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
-from upgini.utils.target_utils import define_task
+from upgini.utils.target_utils import calculate_psi, define_task
 from upgini.utils.warning_counter import WarningCounter
 from upgini.version_validator import validate_version
@@ -2226,14 +2226,11 @@ class FeaturesEnricher(TransformerMixin):
             validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
         )
-        has_date = self._get_date_column(self.fit_search_keys) is not None
+        maybe_date_column = self._get_date_column(self.fit_search_keys)
+        has_date = maybe_date_column is not None
         model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
         self._validate_binary_observations(validated_y, model_task_type)
-        df = self.__handle_index_search_keys(df, self.fit_search_keys)
-        df = self.__correct_target(df)
         self.runtime_parameters = get_runtime_params_custom_loss(
             self.loss, model_task_type, self.runtime_parameters, self.logger
         )
@@ -2245,6 +2242,13 @@ class FeaturesEnricher(TransformerMixin):
                 eval_df[EVAL_SET_INDEX] = idx + 1
                 df = pd.concat([df, eval_df])
+        df = self.__correct_target(df)
+        df = self.__handle_index_search_keys(df, self.fit_search_keys)
+        if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
+            self._validate_PSI(df.sort_values(by=maybe_date_column))
         if DEFAULT_INDEX in df.columns:
             msg = self.bundle.get("unsupported_index_column")
             self.logger.info(msg)
@@ -3567,6 +3571,34 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.warning(msg)
             print(msg)
+    def _validate_PSI(self, df: pd.DataFrame):
+        if EVAL_SET_INDEX in df.columns:
+            train = df.query(f"{EVAL_SET_INDEX} == 0")
+            eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
+        else:
+            train = df
+            eval1 = None
+        # 1. Check train PSI
+        half_train = round(len(train) / 2)
+        part1 = train[:half_train]
+        part2 = train[half_train:]
+        train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
+        if train_psi > 0.2:
+            self.warning_counter.increment()
+            msg = self.bundle.get("train_unstable_target").format(train_psi)
+            print(msg)
+            self.logger.warning(msg)
+        # 2. Check train-test PSI
+        if eval1 is not None:
+            train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
+            if train_test_psi > 0.2:
+                self.warning_counter.increment()
+                msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
+                print(msg)
+                self.logger.warning(msg)
     def _dump_python_libs(self):
         try:
             from pip._internal.operations.freeze import freeze

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -111,7 +111,9 @@ x_is_empty=X is empty
 y_is_empty=y is empty
 x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
 missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
-x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
+x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
+train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
+eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
     # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
 eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -198,7 +200,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=Detected task type: {}\n
+target_type_detected=\nDetected task type: {}\n
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
 too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics

upgini/utils/target_utils.py CHANGED Viewed

@@ -177,3 +177,21 @@ def balance_undersample(
     logger.info(f"Shape after rebalance resampling: {resampled_data}")
     return resampled_data
+def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
+    df = pd.concat([expected, actual])
+    # Define the bins for the target variable
+    df_min = df.min()
+    df_max = df.max()
+    bins = [df_min, (df_min + df_max) / 2, df_max]
+    # Calculate the base distribution
+    train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
+    # Calculate the target distribution
+    test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
+    # Calculate the PSI
+    return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))

{upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.266a3254.post2
+Version: 1.1.267a3254.post3
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers

{upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
 upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=5rc9vcsCBwmRDb8aAPOFGmkRbC7_zGJGPlaSvkytqCk,172880
+upgini/features_enricher.py,sha256=poGGf5MZgangMFmfTxRWtE6FDPDy5VUtXLmW2tGiorI,174170
 upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
 upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
 upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
@@ -14,7 +14,7 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
-upgini/autofe/date.py,sha256=0fOfhJwMk14P_L0oUkl9jDopxzzO0x-XpSG5rOAayUc,3885
+upgini/autofe/date.py,sha256=ffASAn0CQiYRovRrTRLjnPmr_3Xy7GlGLieZv7yBoC0,4218
 upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
 upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
@@ -28,7 +28,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
 upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=_bEfgRl2a9sgoy2RxvIf26NemnCW5CM-1AWWpljwZQE,25664
+upgini/resource_bundle/strings.properties,sha256=00KNv1A3rxXioktqB9o_V_zX0etC2LZO7NBIEsCoNNQ,26087
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -52,11 +52,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
 upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
-upgini/utils/target_utils.py,sha256=5BHcOsBRb4z7P8t3e9rsdXUWUUI7DBmQMmv-x6RwzHM,7152
+upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.266a3254.post2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.266a3254.post2.dist-info/METADATA,sha256=PC7rgzScYGYLi6O0T2PaaTmIBjR5Q9D3TMmZtd1-W9k,48167
-upgini-1.1.266a3254.post2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-upgini-1.1.266a3254.post2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.266a3254.post2.dist-info/RECORD,,
+upgini-1.1.267a3254.post3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.267a3254.post3.dist-info/METADATA,sha256=8-ODvHx4kAE3IrjYFRmIsThFJ8nIeBsD1BWjP6iuDno,48167
+upgini-1.1.267a3254.post3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+upgini-1.1.267a3254.post3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.267a3254.post3.dist-info/RECORD,,

{upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/LICENSE RENAMED Viewed

File without changes

{upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/top_level.txt RENAMED Viewed

File without changes

upgini 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl

upgini 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl