upgini 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/autofe/date.py CHANGED
@@ -54,6 +54,9 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
54
54
  return diff
55
55
 
56
56
 
57
# Aggregations looked up by name when numpy has no attribute of that name.
# Each entry maps the aggregation name to (function, value returned for an empty input).
+ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
58
+
59
+
57
60
  class DateListDiff(PandasOperand, DateDiffMixin):
58
61
  is_binary = True
59
62
  has_symmetry_importance = True
@@ -72,18 +75,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
72
75
 
73
76
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
    """Aggregate, row by row, the differences between a date and a list of dates.

    ``left`` is converted with ``_convert_to_date`` using ``left_unit``. Every
    element of ``right`` is converted the same way using ``right_unit`` and
    wrapped in a ``DatetimeArray``. Each row of differences is then passed
    through ``_diff`` and ``_agg``.
    """
    anchor_dates = self._convert_to_date(left, self.left_unit)

    def as_datetime_array(raw_dates):
        return pd.arrays.DatetimeArray(self._convert_to_date(raw_dates, self.right_unit))

    date_lists = right.apply(as_datetime_array)

    # ``.values`` drops the index of ``right``, so rows are paired by position.
    deltas = pd.Series(anchor_dates - date_lists.values)
    return deltas.apply(lambda row_deltas: self._agg(self._diff(row_deltas)))
75
81
 
76
- return pd.Series(left.index.map(lambda i: self.reduce(self.map_diff(left.loc[i], right.loc[i]))))
82
def _diff(self, x):
    """Express the differences ``x`` in ``diff_unit`` and keep only the positive ones."""
    one_unit = np.timedelta64(1, self.diff_unit)
    scaled = x / one_unit
    positive = scaled > 0
    return scaled[positive]
85
+
86
def _agg(self, x):
    """Reduce ``x`` with the aggregation named by ``self.aggregation``.

    The name is resolved as a numpy attribute first; if numpy has no such
    attribute, ``_ext_aggregations`` is consulted. An empty ``x`` yields the
    aggregation's default (NaN for numpy functions) instead of calling it.

    Raises:
        ValueError: if the name resolves to nothing callable.
    """
    aggregation = self.aggregation
    reducer = getattr(np, aggregation, None)
    empty_value = np.nan

    if reducer is None and aggregation in _ext_aggregations:
        reducer, empty_value = _ext_aggregations[aggregation]
    elif not callable(reducer):
        raise ValueError(f"Unsupported aggregation: {self.aggregation}")

    if len(x) > 0:
        return reducer(x)
    return empty_value
77
95
 
78
96
 
79
97
  class DateListDiffBounded(DateListDiff):
80
98
  lower_bound: Optional[int]
81
99
  upper_bound: Optional[int]
82
- inclusive: Optional[str]
83
100
 
84
101
  def __init__(self, **data: Any) -> None:
85
102
  if "name" not in data:
86
- inclusive = data.get("inclusive")
87
103
  lower_bound = data.get("lower_bound")
88
104
  upper_bound = data.get("upper_bound")
89
105
  components = [
@@ -92,18 +108,10 @@ class DateListDiffBounded(DateListDiff):
92
108
  str(lower_bound if lower_bound is not None else "minusinf"),
93
109
  str(upper_bound if upper_bound is not None else "plusinf"),
94
110
  ]
95
- if inclusive:
96
- components.append(inclusive)
97
111
  components.append(data.get("aggregation"))
98
112
  data["name"] = "_".join(components)
99
113
  super().__init__(**data)
100
114
 
101
- def reduce(self, diff_list: pd.Series) -> float:
102
- return diff_list[
103
- (diff_list > 0)
104
- & (
105
- diff_list.between(
106
- self.lower_bound or -np.inf, self.upper_bound or np.inf, inclusive=self.inclusive or "left"
107
- )
108
- )
109
- ].aggregate(self.aggregation)
115
def _agg(self, x):
    """Aggregate only the differences inside ``[lower_bound, upper_bound)``.

    A bound of ``None`` leaves that side open. The filtered values are handed
    to the parent class's ``_agg``.
    """
    # Test against None explicitly: ``bound or inf`` would also discard a
    # bound of 0, silently turning an explicit limit into "no limit".
    lower = -np.inf if self.lower_bound is None else self.lower_bound
    upper = np.inf if self.upper_bound is None else self.upper_bound

    in_range = (x >= lower) & (x < upper)
    return super()._agg(x[in_range])
@@ -94,7 +94,7 @@ try:
94
94
  except Exception:
95
95
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
96
96
 
97
- from upgini.utils.target_utils import define_task
97
+ from upgini.utils.target_utils import calculate_psi, define_task
98
98
  from upgini.utils.warning_counter import WarningCounter
99
99
  from upgini.version_validator import validate_version
100
100
 
@@ -2226,14 +2226,11 @@ class FeaturesEnricher(TransformerMixin):
2226
2226
  validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2227
2227
  )
2228
2228
 
2229
- has_date = self._get_date_column(self.fit_search_keys) is not None
2229
+ maybe_date_column = self._get_date_column(self.fit_search_keys)
2230
+ has_date = maybe_date_column is not None
2230
2231
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2231
2232
  self._validate_binary_observations(validated_y, model_task_type)
2232
2233
 
2233
- df = self.__handle_index_search_keys(df, self.fit_search_keys)
2234
-
2235
- df = self.__correct_target(df)
2236
-
2237
2234
  self.runtime_parameters = get_runtime_params_custom_loss(
2238
2235
  self.loss, model_task_type, self.runtime_parameters, self.logger
2239
2236
  )
@@ -2245,6 +2242,13 @@ class FeaturesEnricher(TransformerMixin):
2245
2242
  eval_df[EVAL_SET_INDEX] = idx + 1
2246
2243
  df = pd.concat([df, eval_df])
2247
2244
 
2245
+ df = self.__correct_target(df)
2246
+
2247
+ df = self.__handle_index_search_keys(df, self.fit_search_keys)
2248
+
2249
+ if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2250
+ self._validate_PSI(df.sort_values(by=maybe_date_column))
2251
+
2248
2252
  if DEFAULT_INDEX in df.columns:
2249
2253
  msg = self.bundle.get("unsupported_index_column")
2250
2254
  self.logger.info(msg)
@@ -3567,6 +3571,34 @@ class FeaturesEnricher(TransformerMixin):
3567
3571
  self.logger.warning(msg)
3568
3572
  print(msg)
3569
3573
 
3574
def _validate_PSI(self, df: pd.DataFrame):
    """Warn when the target distribution looks unstable according to PSI.

    Two checks are made, each against a threshold of 0.2:

    1. the first half of the train rows versus the second half (the split is
       positional, so the row order of ``df`` decides what each half holds);
    2. all train rows versus the rows of the first eval set, when ``df`` has
       an ``EVAL_SET_INDEX`` column.

    A failed check increments the warning counter, prints the message and
    logs it as a warning.
    """
    psi_threshold = 0.2
    target = self.TARGET_NAME

    def warn_if_unstable(psi, message_key):
        if psi > psi_threshold:
            self.warning_counter.increment()
            text = self.bundle.get(message_key).format(psi)
            print(text)
            self.logger.warning(text)

    has_eval_column = EVAL_SET_INDEX in df.columns
    train_rows = df.query(f"{EVAL_SET_INDEX} == 0") if has_eval_column else df
    eval_rows = df.query(f"{EVAL_SET_INDEX} == 1") if has_eval_column else None

    # 1. Stability inside train: first half against second half
    midpoint = round(len(train_rows) / 2)
    first_half, second_half = train_rows[:midpoint], train_rows[midpoint:]
    warn_if_unstable(calculate_psi(first_half[target], second_half[target]), "train_unstable_target")

    # 2. Stability between train and the first eval set
    if eval_rows is not None:
        warn_if_unstable(calculate_psi(train_rows[target], eval_rows[target]), "eval_unstable_target")
3601
+
3570
3602
  def _dump_python_libs(self):
3571
3603
  try:
3572
3604
  from pip._internal.operations.freeze import freeze
@@ -111,7 +111,9 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
114
- x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
114
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
+ train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
+ eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
115
117
  # eval set validation
116
118
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
117
119
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -198,7 +200,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
198
200
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
199
201
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
200
202
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
201
- target_type_detected=Detected task type: {}\n
203
+ target_type_detected=\nDetected task type: {}\n
202
204
  # all_ok_community_invite=Chat with us in Slack community:
203
205
  all_ok_community_invite=❓ Support request
204
206
  too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
@@ -177,3 +177,21 @@ def balance_undersample(
177
177
 
178
178
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
179
179
  return resampled_data
180
+
181
+
182
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
    """Return the Population Stability Index (PSI) between two numeric samples.

    Both samples are split into two equal-width bins spanning the combined
    min..max range. PSI is ``sum((p - q) * ln(p / q))`` over the bins, where
    ``p`` and ``q`` are the bin shares of ``expected`` and ``actual``.

    Args:
        expected: Baseline sample (e.g. train target values).
        actual: Sample compared against the baseline.

    Returns:
        The PSI as a float. ``0.0`` when the combined sample holds a single
        distinct value (both distributions are then identical). ``NaN`` when
        either sample has no non-missing values, since PSI is undefined.
    """
    # An empty or all-NaN sample has no distribution to compare; value_counts
    # would return an empty result and break the arithmetic below.
    if expected.count() == 0 or actual.count() == 0:
        return float("nan")

    df = pd.concat([expected, actual])

    # Define the bins for the target variable
    df_min = df.min()
    df_max = df.max()
    if df_min == df_max:
        # A constant target cannot be binned (duplicate edges) and has no drift.
        return 0.0
    bins = [df_min, (df_min + df_max) / 2, df_max]

    # Floor for a bin share: an empty bin would otherwise produce log(0) or
    # 0/0 and turn the result into inf/NaN.
    min_share = 1e-4

    # Calculate the base distribution
    train_distribution = np.maximum(
        expected.value_counts(bins=bins, normalize=True).sort_index().values, min_share
    )

    # Calculate the target distribution
    test_distribution = np.maximum(
        actual.value_counts(bins=bins, normalize=True).sort_index().values, min_share
    )

    # Calculate the PSI
    return float(np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.266a3254.post2
3
+ Version: 1.1.267a3254.post3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=5rc9vcsCBwmRDb8aAPOFGmkRbC7_zGJGPlaSvkytqCk,172880
5
+ upgini/features_enricher.py,sha256=poGGf5MZgangMFmfTxRWtE6FDPDy5VUtXLmW2tGiorI,174170
6
6
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
7
7
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
8
8
  upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
@@ -14,7 +14,7 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
14
14
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
16
16
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
17
- upgini/autofe/date.py,sha256=0fOfhJwMk14P_L0oUkl9jDopxzzO0x-XpSG5rOAayUc,3885
17
+ upgini/autofe/date.py,sha256=ffASAn0CQiYRovRrTRLjnPmr_3Xy7GlGLieZv7yBoC0,4218
18
18
  upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
19
19
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
20
20
  upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
@@ -28,7 +28,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
28
28
  upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
29
29
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
30
30
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
31
- upgini/resource_bundle/strings.properties,sha256=_bEfgRl2a9sgoy2RxvIf26NemnCW5CM-1AWWpljwZQE,25664
31
+ upgini/resource_bundle/strings.properties,sha256=00KNv1A3rxXioktqB9o_V_zX0etC2LZO7NBIEsCoNNQ,26087
32
32
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
33
33
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
34
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -52,11 +52,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
52
52
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
53
53
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
54
54
  upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
55
- upgini/utils/target_utils.py,sha256=5BHcOsBRb4z7P8t3e9rsdXUWUUI7DBmQMmv-x6RwzHM,7152
55
+ upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
56
56
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
57
57
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
58
- upgini-1.1.266a3254.post2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
- upgini-1.1.266a3254.post2.dist-info/METADATA,sha256=PC7rgzScYGYLi6O0T2PaaTmIBjR5Q9D3TMmZtd1-W9k,48167
60
- upgini-1.1.266a3254.post2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
61
- upgini-1.1.266a3254.post2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
- upgini-1.1.266a3254.post2.dist-info/RECORD,,
58
+ upgini-1.1.267a3254.post3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
+ upgini-1.1.267a3254.post3.dist-info/METADATA,sha256=8-ODvHx4kAE3IrjYFRmIsThFJ8nIeBsD1BWjP6iuDno,48167
60
+ upgini-1.1.267a3254.post3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
61
+ upgini-1.1.267a3254.post3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
+ upgini-1.1.267a3254.post3.dist-info/RECORD,,