upgini 1.2.91a3884.dev1__tar.gz → 1.2.91a3884.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/PKG-INFO +1 -1
  2. upgini-1.2.91a3884.dev3/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/resource_bundle/strings.properties +1 -0
  4. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/sample_utils.py +76 -13
  5. upgini-1.2.91a3884.dev1/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/.gitignore +0 -0
  7. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/LICENSE +0 -0
  8. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/README.md +0 -0
  9. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/pyproject.toml +0 -0
  10. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/features_enricher.py +0 -0
  37. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/http.py +0 -0
  38. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/country_utils.py +0 -0
  58. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/custom_loss_utils.py +0 -0
  59. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/cv_utils.py +0 -0
  60. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/datetime_utils.py +0 -0
  61. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/deduplicate_utils.py +0 -0
  62. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/ip_utils.py +0 -0
  69. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/mstats.py +0 -0
  70. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/phone_utils.py +0 -0
  71. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/postal_code_utils.py +0 -0
  72. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/progress_bar.py +0 -0
  73. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/sklearn_ext.py +0 -0
  74. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/sort.py +0 -0
  75. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/target_utils.py +0 -0
  76. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/track_info.py +0 -0
  77. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/ts_utils.py +0 -0
  78. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/utils/warning_counter.py +0 -0
  79. {upgini-1.2.91a3884.dev1 → upgini-1.2.91a3884.dev3}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev1
3
+ Version: 1.2.91a3884.dev3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.91a3884.dev3"
@@ -196,6 +196,7 @@ timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) r
196
196
  timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
197
197
  timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
198
198
  date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
199
+ missing_ids_in_eval_set=Following ids are present in eval set but not in sampled train set: {}. They will be removed from eval set.
199
200
  # Upload ads validation
200
201
  ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
201
202
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
@@ -92,12 +92,13 @@ def sample(
92
92
  fit_sample_rows = sample_config.fit_sample_rows
93
93
 
94
94
  if cv_type is not None and cv_type.is_time_series():
95
- return sample_time_series_trunc(
95
+ return sample_time_series_train_eval(
96
96
  df,
97
- sample_columns.ids,
98
- sample_columns.date,
97
+ sample_columns,
99
98
  sample_config.fit_sample_rows_ts,
100
- random_state,
99
+ trim_threshold=fit_sample_threshold,
100
+ max_rows=fit_sample_rows,
101
+ random_state=random_state,
101
102
  logger=logger,
102
103
  **kwargs,
103
104
  )
@@ -128,6 +129,68 @@ def sample(
128
129
  return df
129
130
 
130
131
 
132
+ def sample_time_series_train_eval(
133
+ df: pd.DataFrame,
134
+ sample_columns: SampleColumns,
135
+ sample_size: int,
136
+ trim_threshold: int,
137
+ max_rows: int,
138
+ random_state: int = 42,
139
+ logger: Optional[logging.Logger] = None,
140
+ bundle: Optional[ResourceBundle] = None,
141
+ **kwargs,
142
+ ):
143
+ if sample_columns.eval_set_index in df.columns:
144
+ train_df = df[df[sample_columns.eval_set_index] == 0]
145
+ eval_df = df[df[sample_columns.eval_set_index] > 0]
146
+ else:
147
+ train_df = df
148
+ eval_df = None
149
+
150
+ train_df = sample_time_series_trunc(
151
+ train_df, sample_columns.ids, sample_columns.date, sample_size, random_state, logger=logger, **kwargs
152
+ )
153
+ if sample_columns.ids and eval_df is not None:
154
+ missing_ids = (
155
+ eval_df[~eval_df[sample_columns.ids].isin(np.unique(train_df[sample_columns.ids]))][sample_columns.ids]
156
+ .dropna()
157
+ .drop_duplicates()
158
+ .values.tolist()
159
+ )
160
+ if missing_ids:
161
+ bundle = bundle or get_custom_bundle()
162
+ print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
163
+ eval_df = eval_df.merge(train_df[sample_columns.ids].drop_duplicates())
164
+
165
+ if eval_df is not None:
166
+ if len(eval_df) > trim_threshold - len(train_df):
167
+ eval_df = sample_time_series_trunc(
168
+ eval_df,
169
+ sample_columns.ids,
170
+ sample_columns.date,
171
+ max_rows - len(train_df),
172
+ random_state,
173
+ logger=logger,
174
+ **kwargs,
175
+ )
176
+ df = pd.concat([train_df, eval_df])
177
+
178
+ elif len(train_df) > max_rows:
179
+ df = sample_time_series_trunc(
180
+ train_df,
181
+ sample_columns.ids,
182
+ sample_columns.date,
183
+ max_rows,
184
+ random_state,
185
+ logger=logger,
186
+ **kwargs,
187
+ )
188
+ else:
189
+ df = train_df
190
+
191
+ return df
192
+
193
+
131
194
  def sample_time_series_trunc(
132
195
  df: pd.DataFrame,
133
196
  id_columns: Optional[List[str]],
@@ -189,6 +252,7 @@ def sample_time_series(
189
252
  min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
190
253
  prefer_recent_dates: bool = True,
191
254
  logger: Optional[logging.Logger] = None,
255
+ **kwargs,
192
256
  ):
193
257
  def ensure_tuple(x):
194
258
  return tuple([x]) if not isinstance(x, tuple) else x
@@ -242,9 +306,7 @@ def sample_time_series(
242
306
 
243
307
  def balance_undersample_forced(
244
308
  df: pd.DataFrame,
245
- target_column: str,
246
- id_columns: Optional[List[str]],
247
- date_column: str,
309
+ sample_columns: SampleColumns,
248
310
  task_type: ModelTaskType,
249
311
  cv_type: Optional[CVType],
250
312
  random_state: int,
@@ -268,7 +330,7 @@ def balance_undersample_forced(
268
330
  if warning_callback is not None:
269
331
  warning_callback(msg)
270
332
 
271
- target = df[target_column].copy()
333
+ target = df[sample_columns.target].copy()
272
334
 
273
335
  vc = target.value_counts()
274
336
  max_class_value = vc.index[0]
@@ -280,11 +342,12 @@ def balance_undersample_forced(
280
342
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
281
343
  if cv_type is not None and cv_type.is_time_series():
282
344
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
283
- resampled_data = sample_time_series_trunc(
345
+ resampled_data = sample_time_series_train_eval(
284
346
  df,
285
- id_columns=id_columns,
286
- date_column=date_column,
347
+ sample_columns=sample_columns,
287
348
  sample_size=sample_size,
349
+ trim_threshold=sample_size,
350
+ max_rows=sample_size,
288
351
  random_state=random_state,
289
352
  logger=logger,
290
353
  )
@@ -296,8 +359,8 @@ def balance_undersample_forced(
296
359
  logger.warning(msg)
297
360
 
298
361
  # fill up to min_sample_threshold by majority class
299
- minority_class = df[df[target_column] == min_class_value]
300
- majority_class = df[df[target_column] != min_class_value]
362
+ minority_class = df[df[sample_columns.target] == min_class_value]
363
+ majority_class = df[df[sample_columns.target] != min_class_value]
301
364
  logger.info(
302
365
  f"Min class count: {min_class_count}. Max class count: {max_class_count}."
303
366
  f" Rebalance sample size: {sample_size}"
@@ -1 +0,0 @@
1
- __version__ = "1.2.91a3884.dev1"