upgini 1.2.91a3884.dev1__py3-none-any.whl → 1.2.91a3884.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.91a3884.dev1"
1
+ __version__ = "1.2.91a3884.dev3"
@@ -196,6 +196,7 @@ timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) r
196
196
  timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
197
197
  timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
198
198
  date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
199
+ missing_ids_in_eval_set=Following ids are present in eval set but not in sampled train set: {}. They will be removed from eval set.
199
200
  # Upload ads validation
200
201
  ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
201
202
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
@@ -92,12 +92,13 @@ def sample(
92
92
  fit_sample_rows = sample_config.fit_sample_rows
93
93
 
94
94
  if cv_type is not None and cv_type.is_time_series():
95
- return sample_time_series_trunc(
95
+ return sample_time_series_train_eval(
96
96
  df,
97
- sample_columns.ids,
98
- sample_columns.date,
97
+ sample_columns,
99
98
  sample_config.fit_sample_rows_ts,
100
- random_state,
99
+ trim_threshold=fit_sample_threshold,
100
+ max_rows=fit_sample_rows,
101
+ random_state=random_state,
101
102
  logger=logger,
102
103
  **kwargs,
103
104
  )
@@ -128,6 +129,68 @@ def sample(
128
129
  return df
129
130
 
130
131
 
132
def sample_time_series_train_eval(
    df: pd.DataFrame,
    sample_columns: SampleColumns,
    sample_size: int,
    trim_threshold: int,
    max_rows: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    **kwargs,
) -> pd.DataFrame:
    """Sample a time-series dataset, handling the train and eval parts separately.

    Rows are split on ``sample_columns.eval_set_index``: value ``0`` is the
    train part, values ``> 0`` are the eval part. When that column is absent
    the whole frame is treated as train and there is no eval part.

    The train part is always passed through ``sample_time_series_trunc`` with
    ``sample_size``. If id columns are configured, eval rows whose id
    combination does not occur in the sampled train part are reported to the
    user and removed. The eval part is then passed through
    ``sample_time_series_trunc`` as well, but only when train + eval would
    exceed ``trim_threshold``.

    Args:
        df: Input frame, optionally containing the eval-set index column.
        sample_columns: Names of the id, date and eval-set-index columns.
        sample_size: Size passed to ``sample_time_series_trunc`` for the
            train part.
        trim_threshold: Eval part is sampled only if
            ``len(eval) > trim_threshold - len(train)``.
        max_rows: Overall row budget; eval is sampled down to
            ``max_rows - len(train)``, and a train-only frame is sampled down
            to ``max_rows``.
        random_state: Seed forwarded to ``sample_time_series_trunc``.
        logger: Logger forwarded to ``sample_time_series_trunc``.
        bundle: Message bundle for the user-facing notice; falls back to
            ``get_custom_bundle()`` when falsy.
        **kwargs: Forwarded unchanged to ``sample_time_series_trunc``.

    Returns:
        The sampled train part, concatenated with the (possibly filtered and
        sampled) eval part when one exists.
    """
    id_columns = sample_columns.ids
    date_column = sample_columns.date
    eval_set_column = sample_columns.eval_set_index

    if eval_set_column in df.columns:
        train_df = df[df[eval_set_column] == 0]
        eval_df = df[df[eval_set_column] > 0]
    else:
        train_df = df
        eval_df = None

    train_df = sample_time_series_trunc(
        train_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
    )

    if id_columns and eval_df is not None:
        # Match id *combinations* row-wise. An element-wise ``isin`` against
        # the flattened train values would accept an eval row as soon as each
        # of its id values occurs anywhere in train, which is wrong for
        # composite ids and also upcasts/masks values through ``where``.
        train_ids = train_df[id_columns].drop_duplicates()
        matched = eval_df[id_columns].merge(train_ids, how="left", on=id_columns, indicator=True)
        # The right side has unique keys, so the left merge keeps exactly one
        # output row per eval row, in eval order: the mask is positional.
        is_missing = (matched["_merge"] == "left_only").to_numpy()

        if is_missing.any():
            missing_ids = eval_df.loc[is_missing, id_columns].drop_duplicates().values.tolist()
            bundle = bundle or get_custom_bundle()
            # Deliberately printed (not only logged): this is a user-facing notice.
            print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
            # A boolean mask (instead of an inner merge) keeps the original
            # index labels and dtypes of the surviving eval rows.
            eval_df = eval_df[~is_missing]

    if eval_df is None:
        if len(train_df) > max_rows:
            return sample_time_series_trunc(
                train_df,
                id_columns,
                date_column,
                max_rows,
                random_state,
                logger=logger,
                **kwargs,
            )
        return train_df

    if len(eval_df) > trim_threshold - len(train_df):
        # NOTE(review): ``max_rows - len(train_df)`` is not guarded against
        # being <= 0; behavior then depends on sample_time_series_trunc,
        # which is not visible here - confirm it handles that case.
        eval_df = sample_time_series_trunc(
            eval_df,
            id_columns,
            date_column,
            max_rows - len(train_df),
            random_state,
            logger=logger,
            **kwargs,
        )

    return pd.concat([train_df, eval_df])
192
+
193
+
131
194
  def sample_time_series_trunc(
132
195
  df: pd.DataFrame,
133
196
  id_columns: Optional[List[str]],
@@ -189,6 +252,7 @@ def sample_time_series(
189
252
  min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
190
253
  prefer_recent_dates: bool = True,
191
254
  logger: Optional[logging.Logger] = None,
255
+ **kwargs,
192
256
  ):
193
257
  def ensure_tuple(x):
194
258
  return tuple([x]) if not isinstance(x, tuple) else x
@@ -242,9 +306,7 @@ def sample_time_series(
242
306
 
243
307
  def balance_undersample_forced(
244
308
  df: pd.DataFrame,
245
- target_column: str,
246
- id_columns: Optional[List[str]],
247
- date_column: str,
309
+ sample_columns: SampleColumns,
248
310
  task_type: ModelTaskType,
249
311
  cv_type: Optional[CVType],
250
312
  random_state: int,
@@ -268,7 +330,7 @@ def balance_undersample_forced(
268
330
  if warning_callback is not None:
269
331
  warning_callback(msg)
270
332
 
271
- target = df[target_column].copy()
333
+ target = df[sample_columns.target].copy()
272
334
 
273
335
  vc = target.value_counts()
274
336
  max_class_value = vc.index[0]
@@ -280,11 +342,12 @@ def balance_undersample_forced(
280
342
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
281
343
  if cv_type is not None and cv_type.is_time_series():
282
344
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
283
- resampled_data = sample_time_series_trunc(
345
+ resampled_data = sample_time_series_train_eval(
284
346
  df,
285
- id_columns=id_columns,
286
- date_column=date_column,
347
+ sample_columns=sample_columns,
287
348
  sample_size=sample_size,
349
+ trim_threshold=sample_size,
350
+ max_rows=sample_size,
288
351
  random_state=random_state,
289
352
  logger=logger,
290
353
  )
@@ -296,8 +359,8 @@ def balance_undersample_forced(
296
359
  logger.warning(msg)
297
360
 
298
361
  # fill up to min_sample_threshold by majority class
299
- minority_class = df[df[target_column] == min_class_value]
300
- majority_class = df[df[target_column] != min_class_value]
362
+ minority_class = df[df[sample_columns.target] == min_class_value]
363
+ majority_class = df[df[sample_columns.target] != min_class_value]
301
364
  logger.info(
302
365
  f"Min class count: {min_class_count}. Max class count: {max_class_count}."
303
366
  f" Rebalance sample size: {sample_size}"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev1
3
+ Version: 1.2.91a3884.dev3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=71INSTd7K-9v6Q1BhlXvzqKYEhEddj5rcyg_0HuQwMU,33
1
+ upgini/__about__.py,sha256=55Sg-JLu4aw-5ANNPanS_ciHPSsxXTa8YndbgltGREA,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=c0rZ-ydrnCdrTzx10WZl4WbO3LdyuF0fUCRD8Ugjitg,33093
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=YvW_vyK1klVdvkWFripy8cBH-wGjzsyomoe3Pd20LjY,28359
41
+ upgini/resource_bundle/strings.properties,sha256=Hfpr2-I5Ws6ugIN1QSz549OHayZeLYglRsbrGDT6g9g,28491
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sample_utils.py,sha256=g4OZwxN4LnLmQs3ZCebZDRuztQL0vFkdv8m2IBiv0S0,13204
67
+ upgini/utils/sample_utils.py,sha256=PpMXRVTPKi6TyAo0gPhF0OmXmutecHdonM7WYUsB1Wo,15249
68
68
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
69
69
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
70
70
  upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.91a3884.dev1.dist-info/METADATA,sha256=7y90EE_TXo-9fNsSlsQ7cZxoxvet0C2-ccrGGa9WX-k,49546
75
- upgini-1.2.91a3884.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.91a3884.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.91a3884.dev1.dist-info/RECORD,,
74
+ upgini-1.2.91a3884.dev3.dist-info/METADATA,sha256=teoc8dCmv4mb2eBV6QruZag3xnwK3YAdlKCHuIKllXw,49546
75
+ upgini-1.2.91a3884.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
+ upgini-1.2.91a3884.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.91a3884.dev3.dist-info/RECORD,,