upgini 1.2.37__tar.gz → 1.2.38a3769.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show
  1. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/PKG-INFO +1 -1
  2. upgini-1.2.38a3769.dev2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/dataset.py +21 -2
  4. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/features_enricher.py +44 -24
  5. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/metadata.py +3 -0
  6. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings.properties +0 -1
  7. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/target_utils.py +76 -3
  8. upgini-1.2.37/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/.gitignore +0 -0
  10. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/LICENSE +0 -0
  11. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/README.md +0 -0
  12. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/pyproject.toml +0 -0
  13. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/errors.py +0 -0
  29. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/http.py +0 -0
  30. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  46. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/phone_utils.py +0 -0
  62. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  63. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.37
3
+ Version: 1.2.38a3769.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.38a3769.dev2"
@@ -22,6 +22,7 @@ from upgini.metadata import (
22
22
  EVAL_SET_INDEX,
23
23
  SYSTEM_RECORD_ID,
24
24
  TARGET,
25
+ CVType,
25
26
  DataType,
26
27
  FeaturesFilter,
27
28
  FileColumnMeaningType,
@@ -32,11 +33,12 @@ from upgini.metadata import (
32
33
  NumericInterval,
33
34
  RuntimeParameters,
34
35
  SearchCustomization,
36
+ SearchKey,
35
37
  )
36
38
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
37
39
  from upgini.search_task import SearchTask
38
40
  from upgini.utils.email_utils import EmailSearchKeyConverter
39
- from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
41
+ from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
40
42
 
41
43
  try:
42
44
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -74,6 +76,7 @@ class Dataset: # (pd.DataFrame):
74
76
  search_keys: Optional[List[Tuple[str, ...]]] = None,
75
77
  unnest_search_keys: Optional[Dict[str, str]] = None,
76
78
  model_task_type: Optional[ModelTaskType] = None,
79
+ cv_type: Optional[CVType] = None,
77
80
  random_state: Optional[int] = None,
78
81
  rest_client: Optional[_RestClient] = None,
79
82
  logger: Optional[logging.Logger] = None,
@@ -104,6 +107,7 @@ class Dataset: # (pd.DataFrame):
104
107
 
105
108
  self.dataset_name = dataset_name
106
109
  self.task_type = model_task_type
110
+ self.cv_type = cv_type
107
111
  self.description = description
108
112
  self.meaning_types = meaning_types
109
113
  self.search_keys = search_keys
@@ -225,6 +229,7 @@ class Dataset: # (pd.DataFrame):
225
229
  df=self.data,
226
230
  target_column=target_column,
227
231
  task_type=self.task_type,
232
+ cv_type=self.cv_type,
228
233
  random_state=self.random_state,
229
234
  sample_size=self.FORCE_SAMPLE_SIZE,
230
235
  logger=self.logger,
@@ -297,7 +302,21 @@ class Dataset: # (pd.DataFrame):
297
302
  f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
298
303
  f"and will be downsampled to {sample_rows}"
299
304
  )
300
- resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
305
+ if self.cv_type is not None and self.cv_type.is_time_series():
306
+ resampled_data = balance_undersample_time_series(
307
+ df=self.data,
308
+ id_columns=[k for k, v in self.meaning_types.items() if v == FileColumnMeaningType.CUSTOM_KEY],
309
+ date_column=next(
310
+ k
311
+ for k, v in self.meaning_types.items()
312
+ if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
313
+ ),
314
+ sample_size=sample_rows,
315
+ random_state=self.random_state,
316
+ logger=self.logger,
317
+ )
318
+ else:
319
+ resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
301
320
  self.data = resampled_data
302
321
  self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
303
322
 
@@ -237,6 +237,7 @@ class FeaturesEnricher(TransformerMixin):
237
237
  add_date_if_missing: bool = True,
238
238
  select_features: bool = False,
239
239
  disable_force_downsampling: bool = False,
240
+ id_columns: Optional[List[str]] = None,
240
241
  **kwargs,
241
242
  ):
242
243
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,9 +278,12 @@ class FeaturesEnricher(TransformerMixin):
277
278
  )
278
279
 
279
280
  validate_version(self.logger, self.__log_warning)
281
+
280
282
  self.search_keys = search_keys or {}
283
+ self.id_columns = id_columns
281
284
  self.country_code = country_code
282
285
  self.__validate_search_keys(search_keys, search_id)
286
+
283
287
  self.model_task_type = model_task_type
284
288
  self.endpoint = endpoint
285
289
  self._search_task: Optional[SearchTask] = None
@@ -983,7 +987,7 @@ class FeaturesEnricher(TransformerMixin):
983
987
  with Spinner():
984
988
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
985
989
 
986
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
990
+ has_date = self._get_date_column(search_keys) is not None
987
991
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
988
992
 
989
993
  wrapper = EstimatorWrapper.create(
@@ -1185,7 +1189,7 @@ class FeaturesEnricher(TransformerMixin):
1185
1189
  )
1186
1190
 
1187
1191
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1188
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1192
+ date_column = self._get_date_column(search_keys)
1189
1193
  if (
1190
1194
  uplift_col in metrics_df.columns
1191
1195
  and (metrics_df[uplift_col] < 0).any()
@@ -1354,7 +1358,7 @@ class FeaturesEnricher(TransformerMixin):
1354
1358
  groups = None
1355
1359
 
1356
1360
  if not isinstance(_cv, BaseCrossValidator):
1357
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1361
+ date_column = self._get_date_column(search_keys)
1358
1362
  date_series = X[date_column] if date_column is not None else None
1359
1363
  _cv, groups = CVConfig(
1360
1364
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1667,7 +1671,7 @@ class FeaturesEnricher(TransformerMixin):
1667
1671
  search_keys = self.search_keys.copy()
1668
1672
  search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1669
1673
 
1670
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1674
+ date_column = self._get_date_column(search_keys)
1671
1675
  generated_features = []
1672
1676
  if date_column is not None:
1673
1677
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
@@ -1741,7 +1745,7 @@ class FeaturesEnricher(TransformerMixin):
1741
1745
  search_keys = self.fit_search_keys
1742
1746
 
1743
1747
  rows_to_drop = None
1744
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1748
+ has_date = self._get_date_column(search_keys) is not None
1745
1749
  self.model_task_type = self.model_task_type or define_task(
1746
1750
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1747
1751
  )
@@ -1853,7 +1857,10 @@ class FeaturesEnricher(TransformerMixin):
1853
1857
  df = balance_undersample_forced(
1854
1858
  df=df,
1855
1859
  target_column=TARGET,
1860
+ id_columns=self.id_columns,
1861
+ date_column=self._get_date_column(self.search_keys),
1856
1862
  task_type=self.model_task_type,
1863
+ cv_type=self.cv,
1857
1864
  random_state=self.random_state,
1858
1865
  sample_size=Dataset.FORCE_SAMPLE_SIZE,
1859
1866
  logger=self.logger,
@@ -1995,7 +2002,7 @@ class FeaturesEnricher(TransformerMixin):
1995
2002
  trace_id = trace_id or uuid.uuid4()
1996
2003
  return search_task.get_progress(trace_id)
1997
2004
 
1998
- def get_transactional_transform_api(self, only_online_sources=False):
2005
+ def get_transactional_transform_api(self):
1999
2006
  if self.api_key is None:
2000
2007
  raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
2001
2008
  if self._search_task is None:
@@ -2053,7 +2060,7 @@ class FeaturesEnricher(TransformerMixin):
2053
2060
  api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
2054
2061
  -H 'Authorization: {self.api_key}' \\
2055
2062
  -H 'Content-Type: application/json' \\
2056
- -d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
2063
+ -d '{{"search_keys": {keys}{features_section}}}'"""
2057
2064
  return api_example
2058
2065
 
2059
2066
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -2102,10 +2109,8 @@ class FeaturesEnricher(TransformerMixin):
2102
2109
  self.logger.warning(
2103
2110
  f"There are important features for transform, that generated by online API: {online_api_features}"
2104
2111
  )
2105
- msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2106
- self.logger.warning(msg)
2107
- print(msg)
2108
- print(self.get_transactional_transform_api(only_online_sources=True))
2112
+ # TODO
2113
+ raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2109
2114
 
2110
2115
  if not metrics_calculation:
2111
2116
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -2155,7 +2160,7 @@ class FeaturesEnricher(TransformerMixin):
2155
2160
  df = self.__add_country_code(df, search_keys)
2156
2161
 
2157
2162
  generated_features = []
2158
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2163
+ date_column = self._get_date_column(search_keys)
2159
2164
  if date_column is not None:
2160
2165
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2161
2166
  df = converter.convert(df, keep_time=True)
@@ -2163,7 +2168,7 @@ class FeaturesEnricher(TransformerMixin):
2163
2168
  generated_features.extend(converter.generated_features)
2164
2169
  else:
2165
2170
  self.logger.info("Input dataset hasn't date column")
2166
- if self.add_date_if_missing:
2171
+ if self.__should_add_date_column():
2167
2172
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2168
2173
 
2169
2174
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -2446,7 +2451,14 @@ class FeaturesEnricher(TransformerMixin):
2446
2451
  # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2447
2452
  multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2448
2453
  for multi_key in multi_keys:
2449
- if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2454
+ if multi_key not in [
2455
+ SearchKey.PHONE,
2456
+ SearchKey.IP,
2457
+ SearchKey.POSTAL_CODE,
2458
+ SearchKey.EMAIL,
2459
+ SearchKey.HEM,
2460
+ SearchKey.CUSTOM_KEY,
2461
+ ]:
2450
2462
  msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2451
2463
  self.logger.warning(msg)
2452
2464
  raise ValidationError(msg)
@@ -2610,7 +2622,7 @@ class FeaturesEnricher(TransformerMixin):
2610
2622
  self.fit_generated_features.extend(converter.generated_features)
2611
2623
  else:
2612
2624
  self.logger.info("Input dataset hasn't date column")
2613
- if self.add_date_if_missing:
2625
+ if self.__should_add_date_column():
2614
2626
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2615
2627
 
2616
2628
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
@@ -2643,6 +2655,9 @@ class FeaturesEnricher(TransformerMixin):
2643
2655
 
2644
2656
  self.__adjust_cv(df)
2645
2657
 
2658
+ if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2659
+ self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
2660
+
2646
2661
  df, fintech_warnings = remove_fintech_duplicates(
2647
2662
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2648
2663
  )
@@ -2764,6 +2779,7 @@ class FeaturesEnricher(TransformerMixin):
2764
2779
  search_keys=combined_search_keys,
2765
2780
  unnest_search_keys=unnest_search_keys,
2766
2781
  model_task_type=self.model_task_type,
2782
+ cv_type=self.cv,
2767
2783
  date_format=self.date_format,
2768
2784
  random_state=self.random_state,
2769
2785
  rest_client=self.rest_client,
@@ -2920,6 +2936,9 @@ class FeaturesEnricher(TransformerMixin):
2920
2936
  if not self.warning_counter.has_warnings():
2921
2937
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
2922
2938
 
2939
def __should_add_date_column(self):
    """Decide whether a date key should be added to a dataset that has none.

    Returns ``self.add_date_if_missing`` when it is truthy; otherwise reports
    whether a cross-validation type is set and is a time-series one.
    """
    requested = self.add_date_if_missing
    if requested:
        return requested
    if self.cv is None:
        return False
    return self.cv.is_time_series()
2941
+
2923
2942
  def __adjust_cv(self, df: pd.DataFrame):
2924
2943
  date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2925
2944
  # Check Multivariate time series
@@ -3165,7 +3184,7 @@ class FeaturesEnricher(TransformerMixin):
3165
3184
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3166
3185
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3167
3186
  else:
3168
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3187
+ date_column = FeaturesEnricher._get_date_column(search_keys)
3169
3188
  sort_columns = [date_column] if date_column is not None else []
3170
3189
 
3171
3190
  # Xy = pd.concat([X, y], axis=1)
@@ -3357,6 +3376,10 @@ class FeaturesEnricher(TransformerMixin):
3357
3376
  if t == SearchKey.POSTAL_CODE:
3358
3377
  return col
3359
3378
 
3379
@staticmethod
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Look up the column mapped to a DATE or DATETIME search key.

    Thin wrapper around ``SearchKey.find_key`` so every caller asks for the
    same pair of key types; the result is returned unchanged.
    """
    date_key_types = [SearchKey.DATE, SearchKey.DATETIME]
    return SearchKey.find_key(search_keys, date_key_types)
3382
+
3360
3383
  def _explode_multiple_search_keys(
3361
3384
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
3362
3385
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
@@ -3365,7 +3388,9 @@ class FeaturesEnricher(TransformerMixin):
3365
3388
  for key_name, key_type in search_keys.items():
3366
3389
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3367
3390
  search_key_names_by_type = {
3368
- key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
3391
+ key_type: key_names
3392
+ for key_type, key_names in search_key_names_by_type.items()
3393
+ if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
3369
3394
  }
3370
3395
  if len(search_key_names_by_type) == 0:
3371
3396
  return df, {}
@@ -3418,9 +3443,9 @@ class FeaturesEnricher(TransformerMixin):
3418
3443
  ]
3419
3444
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3420
3445
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3421
- sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3446
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3422
3447
  else:
3423
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3448
+ date_column = self._get_date_column(search_keys)
3424
3449
  sort_columns = [date_column] if date_column is not None else []
3425
3450
 
3426
3451
  sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
@@ -3856,11 +3881,6 @@ class FeaturesEnricher(TransformerMixin):
3856
3881
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
3857
3882
  raise ValidationError(msg)
3858
3883
 
3859
- if SearchKey.CUSTOM_KEY in valid_search_keys.values():
3860
- custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
3861
- for key in custom_keys:
3862
- del valid_search_keys[key]
3863
-
3864
3884
  if (
3865
3885
  len(valid_search_keys.values()) == 1
3866
3886
  and self.country_code is None
@@ -350,3 +350,6 @@ class CVType(Enum):
350
350
  time_series = "time_series"
351
351
  blocked_time_series = "blocked_time_series"
352
352
  not_set = "not_set"
353
+
354
def is_time_series(self) -> bool:
    """Tell whether this member is ``time_series`` or ``blocked_time_series``."""
    time_series_kinds = (CVType.time_series, CVType.blocked_time_series)
    return self in time_series_kinds
@@ -216,7 +216,6 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
  forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
219
- online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
220
219
 
221
220
  # Validation table
222
221
  validation_column_name_header=Column name
@@ -1,15 +1,18 @@
1
+ import itertools
1
2
  import logging
2
- from typing import Callable, Optional, Union
3
+ from typing import Callable, List, Optional, Union
3
4
 
4
5
  import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_numeric_dtype, is_bool_dtype
7
8
 
8
9
  from upgini.errors import ValidationError
9
- from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
10
+ from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
10
11
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
12
  from upgini.sampler.random_under_sampler import RandomUnderSampler
12
13
 
14
+ TS_MIN_DIFFERENT_IDS_RATIO = 0.2
15
+
13
16
 
14
17
  def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
15
18
  if isinstance(y, pd.Series):
@@ -201,7 +204,10 @@ def balance_undersample(
201
204
  def balance_undersample_forced(
202
205
  df: pd.DataFrame,
203
206
  target_column: str,
207
+ id_columns: List[str],
208
+ date_column: str,
204
209
  task_type: ModelTaskType,
210
+ cv_type: CVType | None,
205
211
  random_state: int,
206
212
  sample_size: int = 7000,
207
213
  logger: Optional[logging.Logger] = None,
@@ -233,7 +239,17 @@ def balance_undersample_forced(
233
239
 
234
240
  resampled_data = df
235
241
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
236
- if task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION, ModelTaskType.TIMESERIES]:
242
+ if cv_type is not None and cv_type.is_time_series():
243
+ logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
244
+ resampled_data = balance_undersample_time_series(
245
+ df,
246
+ id_columns=id_columns,
247
+ date_column=date_column,
248
+ sample_size=sample_size,
249
+ random_state=random_state,
250
+ logger=logger,
251
+ )
252
+ elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
237
253
  logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
238
254
  resampled_data = df.sample(n=sample_size, random_state=random_state)
239
255
  else:
@@ -264,6 +280,63 @@ def balance_undersample_forced(
264
280
  return resampled_data
265
281
 
266
282
 
283
+ def balance_undersample_time_series(
284
+ df: pd.DataFrame,
285
+ id_columns: List[str],
286
+ date_column: str,
287
+ sample_size: int,
288
+ random_state: int = 42,
289
+ min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
290
+ prefer_recent_dates: bool = True,
291
+ logger: Optional[logging.Logger] = None,
292
+ ):
293
+ def ensure_tuple(x):
294
+ return tuple([x]) if not isinstance(x, tuple) else x
295
+
296
+ random_state = np.random.RandomState(random_state)
297
+
298
+ ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
299
+ ids_sort = {
300
+ ensure_tuple(k): (
301
+ (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
302
+ )
303
+ for k, v in ids_sort.items()
304
+ }
305
+ id_counts = df[id_columns].value_counts()
306
+ id_counts.index = [ensure_tuple(i) for i in id_counts.index]
307
+ id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
308
+ id_counts = id_counts[id_counts <= sample_size]
309
+ min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
310
+
311
+ def id_mask(sample_index: pd.Index) -> pd.Index:
312
+ if isinstance(sample_index, pd.MultiIndex):
313
+ return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
314
+ else:
315
+ return df[id_columns[0]].isin(sample_index)
316
+
317
+ if len(id_counts) < min_different_ids:
318
+ if logger is not None:
319
+ logger.info(
320
+ f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
321
+ )
322
+ date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
323
+ ids_to_sample = date_counts.index[:min_different_ids]
324
+ mask = id_mask(ids_to_sample)
325
+ df = df[mask]
326
+ sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
327
+ sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
328
+ df = df[df[date_column].isin(sample_date_counts.index)]
329
+ else:
330
+ if len(id_columns) > 1:
331
+ id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
332
+ else:
333
+ id_counts.index = [i[0] for i in id_counts.index]
334
+ mask = id_mask(id_counts.index)
335
+ df = df[mask]
336
+
337
+ return df
338
+
339
+
267
340
  def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
268
341
  try:
269
342
  df = pd.concat([expected, actual])
@@ -1 +0,0 @@
1
- __version__ = "1.2.37"
File without changes
File without changes
File without changes