upgini 1.2.91a3884.dev1__tar.gz → 1.2.91a3884.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/PKG-INFO +1 -1
  2. upgini-1.2.91a3884.dev2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/resource_bundle/strings.properties +1 -0
  4. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/sample_utils.py +75 -13
  5. upgini-1.2.91a3884.dev1/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/.gitignore +0 -0
  7. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/LICENSE +0 -0
  8. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/README.md +0 -0
  9. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/pyproject.toml +0 -0
  10. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/features_enricher.py +0 -0
  37. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/http.py +0 -0
  38. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/country_utils.py +0 -0
  58. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  59. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/cv_utils.py +0 -0
  60. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/datetime_utils.py +0 -0
  61. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  62. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/ip_utils.py +0 -0
  69. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/mstats.py +0 -0
  70. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/phone_utils.py +0 -0
  71. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  72. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/progress_bar.py +0 -0
  73. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  74. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/sort.py +0 -0
  75. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/target_utils.py +0 -0
  76. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/track_info.py +0 -0
  77. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/ts_utils.py +0 -0
  78. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/utils/warning_counter.py +0 -0
  79. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev1
3
+ Version: 1.2.91a3884.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.91a3884.dev2"
@@ -196,6 +196,7 @@ timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) r
196
196
  timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
197
197
  timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
198
198
  date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
199
+ missing_ids_in_eval_set=Following ids are present in eval set but not in sampled train set: {}. They will be removed from eval set.
199
200
  # Upload ads validation
200
201
  ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
201
202
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
@@ -92,12 +92,13 @@ def sample(
92
92
  fit_sample_rows = sample_config.fit_sample_rows
93
93
 
94
94
  if cv_type is not None and cv_type.is_time_series():
95
- return sample_time_series_trunc(
95
+ return sample_time_series_train_eval(
96
96
  df,
97
- sample_columns.ids,
98
- sample_columns.date,
97
+ sample_columns,
99
98
  sample_config.fit_sample_rows_ts,
100
- random_state,
99
+ trim_threshold=fit_sample_threshold,
100
+ max_rows=fit_sample_rows,
101
+ random_state=random_state,
101
102
  logger=logger,
102
103
  **kwargs,
103
104
  )
@@ -128,6 +129,68 @@ def sample(
128
129
  return df
129
130
 
130
131
 
132
def sample_time_series_train_eval(
    df: pd.DataFrame,
    sample_columns: SampleColumns,
    sample_size: int,
    trim_threshold: int,
    max_rows: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    **kwargs,
):
    """Sample a time-series frame, treating train rows and eval-set rows separately.

    When ``sample_columns.eval_set_index`` is a column of ``df``, rows where it
    equals 0 are the train part and rows where it is greater than 0 are the
    eval part; otherwise the whole frame is the train part. The train part is
    always passed through ``sample_time_series_trunc`` with ``sample_size``.

    Args:
        df: Input frame, optionally containing the eval-set index column.
        sample_columns: Names of the ids, date and eval-set index columns.
        sample_size: Row budget handed to ``sample_time_series_trunc`` for the
            train part.
        trim_threshold: The eval part is re-sampled only when train + eval
            rows together exceed this number.
        max_rows: Overall row budget; the eval part is sampled down to
            ``max_rows - len(train)``, and a train-only frame is sampled down
            to ``max_rows``.
        random_state: Seed forwarded to ``sample_time_series_trunc``.
        logger: Logger forwarded to ``sample_time_series_trunc``.
        bundle: Message bundle used for the missing-ids notice; falls back to
            ``get_custom_bundle()``.
        **kwargs: Forwarded unchanged to ``sample_time_series_trunc``.

    Returns:
        The sampled train rows, followed by the (possibly pruned and sampled)
        eval rows when an eval part exists.
    """
    eval_index_column = sample_columns.eval_set_index
    id_columns = sample_columns.ids
    date_column = sample_columns.date

    if eval_index_column in df.columns:
        train_df = df[df[eval_index_column] == 0]
        eval_df = df[df[eval_index_column] > 0]
    else:
        train_df = df
        eval_df = None

    train_df = sample_time_series_trunc(
        train_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
    )

    if id_columns and eval_df is not None:
        # Compare ids as whole rows (composite keys). The previous
        # ``isin(np.unique(...))`` check pooled the values of all id columns
        # and tested each cell independently, so with several id columns a
        # key combination absent from train could go unnoticed, and mixed
        # column types could make ``np.unique`` fail while sorting.
        train_ids = train_df[id_columns].drop_duplicates()
        eval_ids = eval_df[id_columns].dropna().drop_duplicates()
        matched = eval_ids.merge(train_ids, on=id_columns, how="left", indicator=True)
        missing_ids = matched.loc[matched["_merge"] == "left_only", id_columns].values.tolist()
        if missing_ids:
            bundle = bundle or get_custom_bundle()
            print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
            # Inner join on the id columns keeps only eval rows whose ids
            # survived train sampling.
            eval_df = eval_df.merge(train_ids)

    if eval_df is not None:
        if len(eval_df) > trim_threshold - len(train_df):
            # NOTE(review): ``max_rows - len(train_df)`` can be zero or negative
            # when the sampled train part already fills the budget; confirm
            # that ``sample_time_series_trunc`` copes with such a size.
            eval_df = sample_time_series_trunc(
                eval_df,
                id_columns,
                date_column,
                max_rows - len(train_df),
                random_state,
                logger=logger,
                **kwargs,
            )
        df = pd.concat([train_df, eval_df])

    elif len(train_df) > max_rows:
        df = sample_time_series_trunc(
            train_df,
            id_columns,
            date_column,
            max_rows,
            random_state,
            logger=logger,
            **kwargs,
        )
    else:
        df = train_df

    return df
192
+
193
+
131
194
  def sample_time_series_trunc(
132
195
  df: pd.DataFrame,
133
196
  id_columns: Optional[List[str]],
@@ -242,9 +305,7 @@ def sample_time_series(
242
305
 
243
306
  def balance_undersample_forced(
244
307
  df: pd.DataFrame,
245
- target_column: str,
246
- id_columns: Optional[List[str]],
247
- date_column: str,
308
+ sample_columns: SampleColumns,
248
309
  task_type: ModelTaskType,
249
310
  cv_type: Optional[CVType],
250
311
  random_state: int,
@@ -268,7 +329,7 @@ def balance_undersample_forced(
268
329
  if warning_callback is not None:
269
330
  warning_callback(msg)
270
331
 
271
- target = df[target_column].copy()
332
+ target = df[sample_columns.target].copy()
272
333
 
273
334
  vc = target.value_counts()
274
335
  max_class_value = vc.index[0]
@@ -280,11 +341,12 @@ def balance_undersample_forced(
280
341
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
281
342
  if cv_type is not None and cv_type.is_time_series():
282
343
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
283
- resampled_data = sample_time_series_trunc(
344
+ resampled_data = sample_time_series_train_eval(
284
345
  df,
285
- id_columns=id_columns,
286
- date_column=date_column,
346
+ sample_columns=sample_columns,
287
347
  sample_size=sample_size,
348
+ trim_threshold=sample_size,
349
+ max_rows=sample_size,
288
350
  random_state=random_state,
289
351
  logger=logger,
290
352
  )
@@ -296,8 +358,8 @@ def balance_undersample_forced(
296
358
  logger.warning(msg)
297
359
 
298
360
  # fill up to min_sample_threshold by majority class
299
- minority_class = df[df[target_column] == min_class_value]
300
- majority_class = df[df[target_column] != min_class_value]
361
+ minority_class = df[df[sample_columns.target] == min_class_value]
362
+ majority_class = df[df[sample_columns.target] != min_class_value]
301
363
  logger.info(
302
364
  f"Min class count: {min_class_count}. Max class count: {max_class_count}."
303
365
  f" Rebalance sample size: {sample_size}"
@@ -1 +0,0 @@
1
- __version__ = "1.2.91a3884.dev1"