upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
38
38
  loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
39
  multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
40
  group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
+ current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
42
 
42
43
  # Errors
43
44
  failed_search_by_task_id=Failed to retrieve the specified search results
@@ -111,6 +112,9 @@ x_is_empty=X is empty
111
112
  y_is_empty=y is empty
112
113
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
114
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
115
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
+ train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
+ eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
114
118
  # eval set validation
115
119
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
120
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
145
149
  dataset_empty_column_names=Some column names are empty. Add names please
146
150
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
147
151
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
148
- dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
152
+ dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
153
+ dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
154
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
150
155
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
151
156
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
196
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
197
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
198
203
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
199
- target_type_detected=Detected task type: {}\n
204
+ target_type_detected=\nDetected task type: {}\n
200
205
  # all_ok_community_invite=Chat with us in Slack community:
201
206
  all_ok_community_invite=❓ Support request
202
- too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
207
+ too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
203
208
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
204
209
  loss_selection_info=Using loss `{}` for feature selection
205
210
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
upgini/search_task.py CHANGED
@@ -57,7 +57,7 @@ class SearchTask:
57
57
  if logger is not None:
58
58
  self.logger = logger
59
59
  else:
60
- self.logger = logging.getLogger()
60
+ self.logger = logging.getLogger("muted_logger")
61
61
  self.logger.setLevel("FATAL")
62
62
  self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
63
63
  self.unused_features_for_generation: Optional[List[str]] = None
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import List, Optional
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
+ from upgini.metadata import SearchKey
12
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
+ from upgini.utils.warning_counter import WarningCounter
13
15
 
14
16
  DATE_FORMATS = [
15
17
  "%Y-%m-%d",
@@ -44,7 +46,7 @@ class DateTimeSearchKeyConverter:
44
46
  if logger is not None:
45
47
  self.logger = logger
46
48
  else:
47
- self.logger = logging.getLogger()
49
+ self.logger = logging.getLogger("muted_logger")
48
50
  self.logger.setLevel("FATAL")
49
51
  self.generated_features: List[str] = []
50
52
  self.bundle = bundle or get_custom_bundle()
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
98
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
99
101
  self.logger.warning(msg)
100
102
  raise ValidationError(msg)
103
+ else:
104
+ df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
105
+ df[self.date_column] = self.parse_date(df)
101
106
 
102
107
  # If column with date is datetime then extract seconds of the day and minute of the hour
103
108
  # as additional features
@@ -225,3 +230,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
225
230
 
226
231
  is_diff_less_than_two_columns = grouped.apply(check_differences)
227
232
  return is_diff_less_than_two_columns.all()
233
+
234
+
235
def validate_dates_distribution(
    X: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
):
    """Warn when the number of rows per date differs between the two halves of the date range.

    The date column is the one declared in ``search_keys`` as ``SearchKey.DATE`` or
    ``SearchKey.DATETIME`` (the last such entry wins). If none is declared, the first
    non-numeric column of ``X`` outside ``search_keys`` that ``pd.to_datetime`` can
    parse is used. When no date column is found, or fewer than two distinct dates are
    present, the function returns without doing anything.

    Rows are counted per calendar date, the dates are sorted and split in two halves,
    and the mean count of the later half is divided by the mean count of the earlier
    half. If this ratio is above 1.2 or below 0.8 the ``x_unstable_by_date`` message
    is printed and logged, and ``warning_counter`` (when given) is incremented.

    Args:
        X: Training features.
        search_keys: Mapping of column name to its search key type.
        logger: Logger for the warning. When omitted, the "muted_logger" logger
            is used with its level set to FATAL.
        bundle: Message bundle. When omitted, ``get_custom_bundle()`` is used.
        warning_counter: Optional counter incremented when the warning is issued.
    """
    # Prefer a column explicitly declared as a date/datetime search key.
    maybe_date_col = None
    for key, key_type in search_keys.items():
        if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
            maybe_date_col = key

    if maybe_date_col is None:
        for col in X.columns:
            if col in search_keys:
                continue
            try:
                # pd.to_datetime also accepts plain numbers (read as epoch offsets),
                # so without this check the first numeric feature would be taken for
                # a date column and hide a real one located further in the frame.
                if pd.api.types.is_numeric_dtype(X[col]):
                    continue
                pd.to_datetime(X[col])
            except Exception:
                # Not parseable as dates: try the next column.
                continue
            maybe_date_col = col
            break

    if maybe_date_col is None:
        return

    dates = pd.to_datetime(X[maybe_date_col]).dt.date

    date_counts = dates.value_counts().sort_index()
    if len(date_counts) < 2:
        # With less than two distinct dates the earlier half is empty and
        # there is nothing to compare.
        return

    half = round(len(date_counts) / 2)
    earlier_counts = date_counts[:half]
    later_counts = date_counts[half:]
    ratio = later_counts.mean() / earlier_counts.mean()

    if ratio > 1.2 or ratio < 0.8:
        if warning_counter is not None:
            warning_counter.increment()
        if logger is None:
            logger = logging.getLogger("muted_logger")
            logger.setLevel("FATAL")
        bundle = bundle or get_custom_bundle()
        msg = bundle.get("x_unstable_by_date")
        print(msg)
        logger.warning(msg)
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
7
7
  from upgini.resource_bundle import ResourceBundle
8
8
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
9
  from upgini.utils.target_utils import define_task
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
78
78
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
79
79
  if len(rows_with_diff_target) > 0:
80
80
  unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
81
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
82
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
83
- perc = len(rows_to_remove) * 100 / len(df)
84
- msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
85
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
86
- )
87
- if not silent:
88
- print(msg)
89
- if logger:
90
- logger.warning(msg)
91
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
92
- df = df[~df.index.isin(rows_to_remove.index)]
93
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
94
-
81
+ if EVAL_SET_INDEX not in df.columns:
82
+ rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
83
+ rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
84
+ perc = len(rows_to_remove) * 100 / len(df)
85
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
86
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
87
+ )
88
+ if not silent:
89
+ print(msg)
90
+ if logger:
91
+ logger.warning(msg)
92
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
93
+ df = df[~df.index.isin(rows_to_remove.index)]
94
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
+ else:
96
+ # Indices in train and eval_set can be the same so we remove rows from them separately
97
+ train = df.query(f"{EVAL_SET_INDEX} == 0")
98
+ train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
99
+ train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
100
+ train_perc = len(train_rows_to_remove) * 100 / len(train)
101
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
102
+ train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
103
+ )
104
+ if not silent:
105
+ print(msg)
106
+ if logger:
107
+ logger.warning(msg)
108
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
109
+ train = train[~train.index.isin(train_rows_to_remove.index)]
110
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
111
+
112
+ evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
113
+ new_evals = []
114
+ for i, eval in enumerate(evals):
115
+ eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
116
+ eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
117
+ eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
118
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
119
+ eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
120
+ )
121
+ if not silent:
122
+ print(msg)
123
+ if logger:
124
+ logger.warning(msg)
125
+ logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
126
+ eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
127
+ logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
128
+ new_evals.append(eval)
129
+
130
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
131
+ df = pd.concat([train] + new_evals)
132
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
133
  return df
96
134
 
97
135
 
@@ -101,14 +139,18 @@ def clean_full_duplicates(
101
139
  nrows = len(df)
102
140
  if nrows == 0:
103
141
  return df
104
- # Remove absolute duplicates (exclude system_record_id)
142
+ # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
105
143
  unique_columns = df.columns.tolist()
106
144
  if SYSTEM_RECORD_ID in unique_columns:
107
145
  unique_columns.remove(SYSTEM_RECORD_ID)
108
146
  if SORT_ID in unique_columns:
109
147
  unique_columns.remove(SORT_ID)
148
+ if EVAL_SET_INDEX in unique_columns:
149
+ unique_columns.remove(EVAL_SET_INDEX)
110
150
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
111
- df = df.drop_duplicates(subset=unique_columns)
151
+ # Train segment goes first so if duplicates are found in train and eval set
152
+ # then we keep unique rows in train segment
153
+ df = df.drop_duplicates(subset=unique_columns, keep="first")
112
154
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
113
155
  nrows_after_full_dedup = len(df)
114
156
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -123,7 +165,7 @@ def clean_full_duplicates(
123
165
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
124
166
  if marked_duplicates.sum() > 0:
125
167
  dups_indices = df[marked_duplicates].index.to_list()
126
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
168
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
127
169
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
128
170
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
129
171
 
@@ -133,6 +175,7 @@ def clean_full_duplicates(
133
175
  print(msg)
134
176
  df = df.drop_duplicates(subset=unique_columns, keep=False)
135
177
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
178
+
136
179
  return df
137
180
 
138
181
 
@@ -1,5 +1,4 @@
1
1
  import functools
2
- import logging
3
2
  import numbers
4
3
  import time
5
4
  import warnings
@@ -313,7 +312,7 @@ def cross_validate(
313
312
 
314
313
  return ret
315
314
  except Exception:
316
- logging.exception("Failed to execute overriden cross_validate. Fallback to original")
315
+ # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
317
316
  raise
318
317
  # fit_params["use_best_model"] = False
319
318
  # return original_cross_validate(
@@ -6,8 +6,10 @@ import pandas as pd
6
6
  from pandas.api.types import is_numeric_dtype
7
7
 
8
8
  from upgini.errors import ValidationError
9
- from upgini.metadata import ModelTaskType
10
- from upgini.resource_bundle import bundle
9
+ from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
10
+ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
+ from upgini.sampler.random_under_sampler import RandomUnderSampler
12
+ from upgini.utils.warning_counter import WarningCounter
11
13
 
12
14
 
13
15
  def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
@@ -72,3 +74,124 @@ def is_int_encoding(unique_values):
72
74
  return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
73
75
  range(1, len(unique_values) + 1)
74
76
  )
77
+
78
+
79
def _report_sampling_warning(
    msg: str,
    logger: logging.Logger,
    warning_counter: Optional[WarningCounter],
) -> None:
    """Log ``msg`` as a warning, print it and increment ``warning_counter`` when one is given."""
    logger.warning(msg)
    print(msg)
    if warning_counter is not None:
        warning_counter.increment()


def balance_undersample(
    df: pd.DataFrame,
    target_column: str,
    task_type: ModelTaskType,
    random_state: int,
    imbalance_threshold: float = 0.2,
    min_sample_threshold: int = 5000,
    binary_bootstrap_loops: int = 5,
    multiclass_bootstrap_loops: int = 2,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
) -> pd.DataFrame:
    """Undersample the frequent target classes of ``df`` when the target is imbalanced.

    Three cases are handled, in this order:

    * ``task_type`` is ``ModelTaskType.MULTICLASS``: classes are ordered by row count
      (most frequent first) and the class at position ``int(0.75 * n_classes) - 1`` is
      taken as the reference. If the largest class has more than
      ``multiclass_bootstrap_loops`` times the reference count, every class ranked
      above the reference is capped at that many rows.
    * otherwise, if ``df`` has more than ``min_sample_threshold`` rows and the rarest
      class has fewer than ``min_sample_threshold / 2`` rows: all rows of the rarest
      class are kept and the rest is randomly sampled so that the total does not
      exceed ``min_sample_threshold``.
    * otherwise, if the largest class has more than ``binary_bootstrap_loops`` times
      the rows of the rarest class: the largest class is capped at
      ``binary_bootstrap_loops * min_class_count`` rows.

    In each case a warning is logged and printed, and ``warning_counter`` is
    incremented when given. If none applies, ``df`` is returned as passed in.

    Args:
        df: Dataset containing ``target_column`` and the ``SYSTEM_RECORD_ID`` column.
        target_column: Name of the target column.
        task_type: Task type; only the multiclass value selects a dedicated branch.
        random_state: Seed passed to the samplers.
        imbalance_threshold: Share used to compute the rarest-class threshold that is
            reported in the warning message.
        min_sample_threshold: Row count used by the second case above.
        binary_bootstrap_loops: Allowed majority/minority ratio in the third case.
        multiclass_bootstrap_loops: Allowed ratio to the reference class in the
            multiclass case.
        logger: Logger for warnings. When omitted, the "muted_logger" logger is used
            with its level set to FATAL.
        bundle: Message bundle. When omitted, ``get_custom_bundle()`` is used.
        warning_counter: Optional counter of issued warnings.

    Returns:
        The resampled rows (ordered by ``SYSTEM_RECORD_ID``) or ``df`` itself when no
        resampling was needed.

    Raises:
        Exception: If ``SYSTEM_RECORD_ID`` is not a column of ``df``.
    """
    if logger is None:
        logger = logging.getLogger("muted_logger")
        logger.setLevel("FATAL")
    bundle = bundle or get_custom_bundle()
    if SYSTEM_RECORD_ID not in df.columns:
        raise Exception("System record id must be presented for undersampling")

    count = len(df)
    target_classes_count = df[target_column].nunique()

    # value_counts orders classes from the most to the least frequent.
    vc = df[target_column].value_counts()
    if vc.empty:
        # No rows or no non-missing target values: nothing to balance.
        return df

    max_class_value = vc.index[0]
    min_class_value = vc.index[-1]
    max_class_count = vc[max_class_value]
    min_class_count = vc[min_class_value]

    min_class_percent = imbalance_threshold / target_classes_count
    min_class_threshold = min_class_percent * count

    resampled_data = df
    df = df.sort_values(by=SYSTEM_RECORD_ID)
    # The target is read from the sorted frame so that it lines up row by row with
    # the SYSTEM_RECORD_ID frame handed to the sampler below.
    # NOTE(review): assumes RandomUnderSampler pairs X and y by position - confirm.
    target = df[target_column]

    if task_type == ModelTaskType.MULTICLASS:
        # Sort classes by rows count and find 25% quantile class
        classes = vc.index
        quantile25_idx = int(0.75 * len(classes)) - 1
        quantile25_class = classes[quantile25_idx]
        quantile25_class_cnt = vc[quantile25_class]

        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
            msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
            _report_sampling_warning(msg, logger, warning_counter)

            # 25% and lower classes will stay as is. Higher classes will be downsampled
            class_cap = quantile25_class_cnt * multiclass_bootstrap_loops
            sample_strategy = {
                classes[class_idx]: min(vc[classes[class_idx]], class_cap)
                for class_idx in range(quantile25_idx)
            }
            sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
            X = df[SYSTEM_RECORD_ID].to_frame(SYSTEM_RECORD_ID)
            new_x, _ = sampler.fit_resample(X, target)  # type: ignore

            resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
        msg = bundle.get("dataset_rarest_class_less_threshold").format(
            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
        )
        _report_sampling_warning(msg, logger, warning_counter)

        # fill up to min_sample_threshold by majority class
        minority_class = df[df[target_column] == min_class_value]
        majority_class = df[df[target_column] != min_class_value]
        sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
        sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
        resampled_data = df[
            (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
        ]

    elif max_class_count > min_class_count * binary_bootstrap_loops:
        msg = bundle.get("dataset_rarest_class_less_threshold").format(
            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
        )
        _report_sampling_warning(msg, logger, warning_counter)

        sampler = RandomUnderSampler(
            sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
        )
        X = df[SYSTEM_RECORD_ID].to_frame(SYSTEM_RECORD_ID)
        new_x, _ = sampler.fit_resample(X, target)  # type: ignore

        resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]

    # Log the shape only: interpolating the frame itself would dump its content.
    logger.info(f"Shape after rebalance resampling: {resampled_data.shape}")
    return resampled_data
180
+
181
+
182
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
    """Return the Population Stability Index between two samples of a numeric target.

    Both samples are bucketed into the same two bins, split at the midpoint between
    the minimum and the maximum of the combined values. The index is
    ``sum((e - a) * ln(e / a))`` over the bins, where ``e`` and ``a`` are the shares
    of ``expected`` and ``actual`` falling into each bin.

    Args:
        expected: Reference sample (base distribution).
        actual: Sample compared against the reference.

    Returns:
        The PSI as a plain ``float``. ``0.0`` is returned when the combined values
        are constant or all missing, since no two bins can be built in that case.
    """
    df = pd.concat([expected, actual])

    # Define the bins for the target variable
    df_min = df.min()
    df_max = df.max()
    if pd.isna(df_min) or pd.isna(df_max) or df_min == df_max:
        # Identical bin edges are rejected by value_counts(bins=...); a constant
        # target has the same (single-point) distribution in both samples.
        return 0.0
    bins = [df_min, (df_min + df_max) / 2, df_max]

    # Calculate the base distribution
    train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values

    # Calculate the target distribution
    test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values

    # An empty bin would give a division by zero or log(0) and turn the result into
    # inf/NaN, so shares are floored at a small positive value first.
    min_share = 1e-4
    train_distribution = np.clip(train_distribution, min_share, None)
    test_distribution = np.clip(test_distribution, min_share, None)

    # Calculate the PSI
    return float(np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.262a3250.post3
3
+ Version: 1.1.274a4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
28
28
  License-File: LICENSE
29
29
  Requires-Dist: python-dateutil >=2.8.0
30
30
  Requires-Dist: requests >=2.8.0
31
- Requires-Dist: pandas <2.0.0,>=1.1.0
31
+ Requires-Dist: pandas <2.1.0,>=1.1.0
32
32
  Requires-Dist: numpy >=1.19.0
33
33
  Requires-Dist: scikit-learn >=1.3.0
34
34
  Requires-Dist: pydantic <2.0.0,>=1.8.2
@@ -1,34 +1,35 @@
1
1
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
- upgini/dataset.py,sha256=ywBwf93d0IH39ZGfmNDlAwe1ILQtt1WzJ87WfIOMI2g,48149
3
+ upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=fFSLW6aAzVq5YYaVcl-xbjSd3qYt8dW9hYAIestylSk,172118
5
+ upgini/features_enricher.py,sha256=WDj4DO5lqANBdihEcRmwox4w1kqWVOorlIKY4dbsqrU,175376
6
+ upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
6
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
7
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
8
- upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
9
- upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
9
+ upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
10
+ upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
10
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
11
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
12
13
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
13
14
  upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
14
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- upgini/autofe/all_operands.py,sha256=KWAdcYv6cToc6NZPcCmz6P3N8Nwjp8UqojKuz-f2BZY,1589
16
+ upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
16
17
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
17
- upgini/autofe/date.py,sha256=_VqhFMkzItbWxQjNgxqerx0sWbcV9yxq0q5kI33LvHk,1807
18
- upgini/autofe/feature.py,sha256=y5UMU8_cSrP9-3xmrmVlGXwIX2_bwTmzgQy4ShwEjMk,11812
18
+ upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
19
+ upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
19
20
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
20
21
  upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
21
22
  upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
22
23
  upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
23
24
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- upgini/data_source/data_source_publisher.py,sha256=yCMyYwFTfv0e7h-kAdtiQCF42J1DbqmJ1Wi0xt_ZzeM,15578
25
+ upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
25
26
  upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
26
27
  upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
27
28
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
29
+ upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
29
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
30
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
31
- upgini/resource_bundle/strings.properties,sha256=MGU_oBc15VAmbPZdThCpm3B4xERAKwbCIUTIG66dvUo,25228
32
+ upgini/resource_bundle/strings.properties,sha256=x-2fXtGc5Z2n7eUg9b6I4yhok56TTXDvzwU1JUaKcj4,26285
32
33
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
33
34
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
35
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -40,8 +41,8 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
40
41
  upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
41
42
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
42
43
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
43
- upgini/utils/datetime_utils.py,sha256=5wvEz9DWL_RS4EST5FFIidfD36MSL-wij4P9AAJpMl0,8822
44
- upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
44
+ upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
45
+ upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
45
46
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
46
47
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
47
48
  upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -51,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
51
52
  upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
52
53
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
53
54
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
54
- upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
55
- upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
55
+ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
56
+ upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
56
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
57
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
58
- upgini-1.1.262a3250.post3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
- upgini-1.1.262a3250.post3.dist-info/METADATA,sha256=3IXK7QAB6WSAAiUgvdnudgEkXMCAz5e9tJQ4L35mOvE,48167
60
- upgini-1.1.262a3250.post3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
61
- upgini-1.1.262a3250.post3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
- upgini-1.1.262a3250.post3.dist-info/RECORD,,
59
+ upgini-1.1.274a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.274a4.dist-info/METADATA,sha256=xng0cJvEGeFT2zSBqLDy-qf9I6ONKxdKtXsFWokPpPs,48158
61
+ upgini-1.1.274a4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.274a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.274a4.dist-info/RECORD,,