upgini 1.2.112__py3-none-any.whl → 1.2.113a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metadata.py CHANGED
@@ -285,6 +285,7 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
+    psi_value: Optional[float] = None


 class HitRateMetrics(BaseModel):
upgini/metrics.py CHANGED
@@ -1175,7 +1175,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-    _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
     check_consistent_length(y_true, y_pred, sample_weight)

     if (y_true < 0).any():
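
Note: the try/except above tracks a signature change in scikit-learn's private _check_reg_targets helper, which in newer releases also takes sample_weight and returns it as a fifth value. A minimal standalone sketch of the same compatibility pattern (the import path and version behaviour are assumptions, not stated in this diff):

    from sklearn.metrics._regression import _check_reg_targets

    def _compat_check_reg_targets(y_true, y_pred, sample_weight, multioutput):
        try:
            # older scikit-learn: helper takes no sample_weight and returns 4 values
            _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
        except TypeError:
            # newer scikit-learn: sample_weight is passed through and returned as well
            _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(
                y_true, y_pred, sample_weight, multioutput
            )
        return y_true, y_pred, sample_weight, multioutput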
upgini/resource_bundle/strings.properties CHANGED
@@ -123,7 +123,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y, or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set differs from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +139,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X have different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but the date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT but columns for online API are also provided. It will be ignored for stability check

 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and retry
@@ -255,6 +257,7 @@ features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
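
The two new oot_* messages indicate that an eval_set entry may now be an X-only tuple, treated as an out-of-time (OOT) sample for the PSI stability check; it needs a date search key and cannot rely on online-API columns. A hypothetical usage sketch (frame names and the date key are placeholders, not from the package docs):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(search_keys={"rep_date": SearchKey.DATE})
    enricher.fit(
        train_X, train_y,         # training frames with a "rep_date" column (assumed)
        eval_set=[
            (eval_X, eval_y),     # ordinary labelled eval pair
            (oot_X,),             # X only: no target, used for the stability check
        ],
    )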
upgini/sampler/base.py CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT

@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y

 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type

@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y

     def _more_tags(self):
upgini/sampler/random_under_sampler.py CHANGED
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE

     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
-            reset=True,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-        )
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y

     def _fit_resample(self, X, y):
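
Both sampler changes address the same scikit-learn compatibility break: newer releases remove the estimator's _validate_data method in favour of functions in sklearn.utils.validation, and rename the force_all_finite argument to ensure_all_finite. A minimal sketch of the fallback idiom, assuming those renames (exact version boundaries are not stated in this diff):

    def _compat_validate(estimator, X, y):
        try:
            # older scikit-learn: mixin method, force_all_finite spelling
            return estimator._validate_data(
                X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False
            )
        except AttributeError:
            # newer scikit-learn: method removed, use the functional check
            from sklearn.utils.validation import check_X_y
            return check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None, ensure_all_finite=False)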
upgini/utils/deduplicate_utils.py CHANGED
@@ -136,6 +136,9 @@ def remove_fintech_duplicates(
     # Process each eval_set part separately
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -190,16 +193,49 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0, check separately whether all TARGET values are NaN
+        excluded_from_dedup = pd.DataFrame()
+        df_for_dedup = df
+
+        if EVAL_SET_INDEX in df.columns:
+            excluded_parts = []
+            # Get all unique eval_set_index values, except 0
+            unique_eval_indices = df[df[EVAL_SET_INDEX] != 0][EVAL_SET_INDEX].unique()
+
+            for eval_idx in unique_eval_indices:
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if len(eval_subset) > 0 and eval_subset[TARGET].isna().all():
+                    excluded_parts.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+
+            # Combine all excluded parts
+            if excluded_parts:
+                excluded_from_dedup = pd.concat(excluded_parts, ignore_index=False)
+                # Remove excluded rows from dataframe for deduplication
+                excluded_indices = excluded_from_dedup.index
+                df_for_dedup = df[~df.index.isin(excluded_indices)]
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)

             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df = df.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if len(excluded_from_dedup) > 0:
+            df = pd.concat([df_for_dedup, excluded_from_dedup], ignore_index=False)
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup

     return df, msg

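
A toy illustration (hypothetical data) of why the all-NaN-target OOT rows are set aside in clean_full_duplicates: an eval part whose TARGET is entirely NaN shares keys with labelled rows, so a naive key-based dedup would drop both copies as "same keys, different target" duplicates.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "phone": [1, 1, 2, 2],
        "eval_set_index": [0, 1, 0, 1],
        "target": [1.0, np.nan, 0.0, np.nan],
    })
    keys = ["phone"]
    # naive dedup drops every row: each phone occurs twice with differing targets
    # df.drop_duplicates(subset=keys, keep=False)  ->  empty frame
    oot = df.groupby("eval_set_index")["target"].transform(lambda s: s.isna().all())
    deduped = df[~oot].drop_duplicates(subset=keys, keep=False)
    result = pd.concat([deduped, df[oot]])  # OOT rows are added back untouched
    print(result)  # all four rows survive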
upgini/utils/feature_info.py CHANGED
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None

     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )

     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini/utils/psi.py ADDED
@@ -0,0 +1,268 @@
+import itertools
+import logging
+import operator
+from functools import reduce
+from typing import Callable, Dict, Optional
+
+import more_itertools
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from pydantic import BaseModel
+
+from upgini.metadata import TARGET, ModelTaskType
+
+
+class StabilityParams(BaseModel):
+    threshold: float = 999
+    n_intervals: int = 12
+    min_intervals: int = 10
+    max_intervals: Optional[int] = None
+    min_values_in_interval: Optional[int] = None
+    n_bins: int = 10
+    min_values_in_bin: Optional[int] = None
+    cat_top_pct: float = 0.7
+    agg: str = "max"
+
+
+DEFAULT_TARGET_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=5,
+)
+
+DEFAULT_FEATURES_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=10,
+)
+
+
+def calculate_features_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    empty_res = pd.Series(index=df.columns, data=0)
+
+    if not is_numeric_dtype(df[date_column]):
+        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+    if TARGET in df.columns:
+        psi_target_params.n_intervals = min(
+            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+        )
+        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+        if psi_target_params.min_values_in_interval is not None and any(
+            len(mask) < psi_target_params.min_values_in_interval
+            for mask in itertools.chain(current_masks, [reference_mask])
+        ):
+            logger.info(
+                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+            )
+            return empty_res
+
+        target_agg_func = _get_agg_func(psi_target_params.agg)
+        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+        target_psi = _stability_agg(
+            [df[TARGET][cur] for cur in current_masks],
+            reference_data=df[TARGET][reference_mask],
+            is_numerical=model_task_type == ModelTaskType.REGRESSION,
+            min_values_in_bin=psi_target_params.min_values_in_bin,
+            n_bins=psi_target_params.n_bins,
+            cat_top_pct=psi_target_params.cat_top_pct,
+            agg_func=target_agg_func,
+        )
+        if target_psi is None:
+            logger.info("Cannot determine target PSI. Skip feature PSI check")
+            return pd.Series(index=df.columns, data=0)
+
+        if target_psi > psi_target_params.threshold:
+            logger.info(
+                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+            )
+            return empty_res
+
+    psi_features_params.n_intervals = min(
+        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+    )
+    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+    logger.info(f"Calculating PSI for {len(df.columns)} features")
+    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+    features_agg_func = _get_agg_func(psi_features_params.agg)
+    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+    psi_values = [
+        _stability_agg(
+            [df[feature][cur] for cur in current_masks],
+            reference_data=df[feature][reference_mask],
+            is_numerical=feature not in cat_features,
+            min_values_in_bin=psi_features_params.min_values_in_bin,
+            n_bins=psi_features_params.n_bins,
+            cat_top_pct=psi_features_params.cat_top_pct,
+            agg_func=features_agg_func,
+        )
+        for feature in df.columns
+        if feature not in [TARGET, date_column]
+    ]
+    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+def _split_intervals(
+    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+) -> tuple[pd.Series, list[pd.Series]]:
+    date_series = df[date_column]
+
+    # Check if we have enough unique values for the requested number of intervals
+    unique_values = date_series.nunique()
+
+    # If we have fewer unique values than requested intervals, adjust n_intervals
+    if unique_values < n_intervals:
+        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+    interval_labels = time_intervals.unique()
+    reference_mask = time_intervals == interval_labels[0]
+    current_masks = [time_intervals == label for label in interval_labels[1:]]
+    return reference_mask, current_masks
+
+
+def _get_agg_func(agg: str):
+    np_agg = getattr(np, agg, None)
+    if np_agg is None and agg.startswith("q"):
+        q = int(agg[1:])
+        return lambda x: np.quantile(list(x), q / 100, method="higher")
+    return np_agg
+
+
+def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+def _stability_agg(
+    current_data: list[pd.Series],
+    reference_data: pd.Series,
+    is_numerical: bool = True,
+    min_values_in_bin: int | None = None,
+    n_bins: int = 10,
+    cat_top_pct: float = 0.7,
+    agg_func: Callable = max,
+) -> float | None:
+    """Calculate the PSI
+    Args:
+        current_data: current data
+        reference_data: reference data
+        is_numerical: whether the feature is numerical
+        agg_func: function used to aggregate PSI values across intervals
+        min_values_in_bin: minimum number of values in a bin to calculate PSI
+        n_bins: number of bins to use for numerical features
+    Returns:
+        psi_value: calculated PSI
+    """
+    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+    if len(reference) == 0 or len(current) == 0:
+        return None
+
+    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+    current = [current[i] for i in nonempty_current]
+    current_data = [current_data[i] for i in nonempty_current]
+
+    if len(current) == 0:
+        return None
+
+    if min_values_in_bin is not None and (
+        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+    ):
+        return None
+
+    reference = _fill_zeroes(reference / len(reference_data))
+    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+    psi_value = agg_func([_psi(reference, c) for c in current])
+
+    return psi_value
+
+
+def _get_binned_data(
+    reference_data: pd.Series,
+    current_data: list[pd.Series],
+    is_numerical: bool,
+    n_bins: int,
+    cat_top_pct: float,
+):
+    """Split variable into n buckets based on reference quantiles
+    Args:
+        reference_data: reference data
+        current_data: current data
+        is_numerical: whether the feature is numerical
+        n_bins: number of quantiles
+    Returns:
+        reference_counts: number of records in each bucket for reference
+        current_counts: number of records in each bucket for current
+    """
+    n_vals = reference_data.nunique()
+
+    if is_numerical and n_vals > 20:
+        bins = _get_bin_edges(reference_data, n_bins)
+        reference_counts = np.histogram(reference_data, bins)[0]
+        current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+    else:
+        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+        key_dict = more_itertools.map_reduce(
+            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+            keyfunc=operator.itemgetter(0),
+            valuefunc=operator.itemgetter(1),
+            reducefunc=sum,
+        )
+        key_dict = pd.Series(key_dict)
+        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+        reference_counts = np.array([ref_feature_dict[key] for key in keys])
+        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+    reference_counts = np.append(reference_counts, reference_data.isna().sum())
+    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+    return reference_counts, current_counts
+
+
+def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+    eps = 0.0001
+    if (percents == 0).all():
+        np.place(percents, percents == 0, eps)
+    else:
+        min_value = min(percents[percents != 0])
+        if min_value <= eps:
+            np.place(percents, percents == 0, eps)
+        else:
+            np.place(percents, percents == 0, min_value / 10**6)
+    return percents
+
+
+def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+    bins[0] = -np.inf
+    bins[-1] = np.inf
+    return bins
+
+
+def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+    """Get unique values from current and reference series, drop NaNs"""
+    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
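
The core metric in _psi is the standard Population Stability Index: PSI = sum over bins of (p_ref - p_cur) * ln(p_ref / p_cur), computed for each time interval against the first (reference) interval and then aggregated with the configured agg function ("max" by default; quantile strings such as "q95" are also understood by _get_agg_func). A small worked check of the formula:

    import numpy as np

    p_ref = np.array([0.5, 0.3, 0.2])  # reference bin shares
    p_cur = np.array([0.4, 0.4, 0.2])  # current bin shares

    psi = np.sum((p_ref - p_cur) * np.log(p_ref / p_cur))
    print(round(psi, 4))  # 0.0511 -- below the common 0.1 "stable" rule of thumb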
{upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.112
+Version: 1.2.113a2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/RECORD RENAMED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=2l59GHTYScTlsiV491ecYRn_6bm6FIVavXCWQJfNn2Q,24
+upgini/__about__.py,sha256=biW76aRiAOQOmbgly1mb3ZD32Tz7szlWijWrBRJeIPM,26
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=xFi0a-A3uvtxVwFM6JOyitkEPd1I2slIBj5SWfys3hQ,32724
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=rfVdHgUYEq9saqhWcI04jUmNQcAAn5Kto4w3WpxlOpA,221762
+upgini/features_enricher.py,sha256=5zk_FiJZXB0oodsfg_6rG1U4PW6r08aqQpKG4r7_kdo,235864
 upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
-upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
-upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
+upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
+upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,11 +38,11 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=NyxRwzehkrL5LMoVyjkhN811MvalepavNfjlC9ubE0Q,28677
+upgini/resource_bundle/strings.properties,sha256=6Q3dwI0v1aiXt7_3Xx0Ih6jMmSCBaaRGIoUiZ5-VnCY,28988
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
-upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
+upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
+upgini/sampler/random_under_sampler.py,sha256=4mofmaRTmNwT_HqxecWJyfXdLKK0h9jMBwS46xdrIqE,4356
 upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
 upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
 upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
@@ -52,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
-upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
+upgini/utils/deduplicate_utils.py,sha256=xXashCSIg87gCy6QyXc0eb8huuzPLANmckMVxUVBEgM,10729
 upgini/utils/display_utils.py,sha256=Ou7dYdgvvdh443OgOLTM_xKwC2ITx9DQrpKoC2vCRYc,11856
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
+upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -64,6 +64,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
+upgini/utils/psi.py,sha256=gYeZ3FOwGriVUnuO3BbdicSgXGQqPU14f7yleyO55f0,10108
 upgini/utils/sample_utils.py,sha256=lZJ4yf9Jiq9Em2Ny9m3RIiF7WSxBPrc4E3xxn_8sQk8,15417
 upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -71,7 +72,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.112.dist-info/METADATA,sha256=0FctuJ3ulRlAtYCinvR1Y0Q3cD7yffMLLfWI7LctDBY,49529
-upgini-1.2.112.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.112.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.112.dist-info/RECORD,,
+upgini-1.2.113a2.dist-info/METADATA,sha256=55WWYsP-6y2aWCwjdk6NTxZbggKqTpF4w-BS93qDr8M,49531
+upgini-1.2.113a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.113a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.113a2.dist-info/RECORD,,