upgini 1.2.38a3769.dev8__tar.gz → 1.2.39a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/PKG-INFO +1 -1
  2. upgini-1.2.39a1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/dataset.py +2 -24
  4. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/features_enricher.py +37 -60
  5. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/metadata.py +0 -3
  6. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/strings.properties +1 -0
  7. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/target_utils.py +3 -78
  8. upgini-1.2.38a3769.dev8/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/.gitignore +0 -0
  10. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/LICENSE +0 -0
  11. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/README.md +0 -0
  12. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/pyproject.toml +0 -0
  13. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/errors.py +0 -0
  29. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/http.py +0 -0
  30. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  46. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/phone_utils.py +0 -0
  62. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/postal_code_utils.py +0 -0
  63. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.38a3769.dev8
3
+ Version: 1.2.39a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.39a1"
@@ -22,7 +22,6 @@ from upgini.metadata import (
22
22
  EVAL_SET_INDEX,
23
23
  SYSTEM_RECORD_ID,
24
24
  TARGET,
25
- CVType,
26
25
  DataType,
27
26
  FeaturesFilter,
28
27
  FileColumnMeaningType,
@@ -33,12 +32,11 @@ from upgini.metadata import (
33
32
  NumericInterval,
34
33
  RuntimeParameters,
35
34
  SearchCustomization,
36
- SearchKey,
37
35
  )
38
36
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
39
37
  from upgini.search_task import SearchTask
40
38
  from upgini.utils.email_utils import EmailSearchKeyConverter
41
- from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
39
+ from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
42
40
 
43
41
  try:
44
42
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -76,8 +74,6 @@ class Dataset: # (pd.DataFrame):
76
74
  search_keys: Optional[List[Tuple[str, ...]]] = None,
77
75
  unnest_search_keys: Optional[Dict[str, str]] = None,
78
76
  model_task_type: Optional[ModelTaskType] = None,
79
- cv_type: Optional[CVType] = None,
80
- id_columns: Optional[List[str]] = None,
81
77
  random_state: Optional[int] = None,
82
78
  rest_client: Optional[_RestClient] = None,
83
79
  logger: Optional[logging.Logger] = None,
@@ -108,7 +104,6 @@ class Dataset: # (pd.DataFrame):
108
104
 
109
105
  self.dataset_name = dataset_name
110
106
  self.task_type = model_task_type
111
- self.cv_type = cv_type
112
107
  self.description = description
113
108
  self.meaning_types = meaning_types
114
109
  self.search_keys = search_keys
@@ -121,7 +116,6 @@ class Dataset: # (pd.DataFrame):
121
116
  self.random_state = random_state
122
117
  self.columns_renaming: Dict[str, str] = {}
123
118
  self.imbalanced: bool = False
124
- self.id_columns = id_columns
125
119
  if logger is not None:
126
120
  self.logger = logger
127
121
  else:
@@ -231,8 +225,6 @@ class Dataset: # (pd.DataFrame):
231
225
  df=self.data,
232
226
  target_column=target_column,
233
227
  task_type=self.task_type,
234
- cv_type=self.cv_type,
235
- id_columns=self.id_columns,
236
228
  random_state=self.random_state,
237
229
  sample_size=self.FORCE_SAMPLE_SIZE,
238
230
  logger=self.logger,
@@ -305,21 +297,7 @@ class Dataset: # (pd.DataFrame):
305
297
  f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
306
298
  f"and will be downsampled to {sample_rows}"
307
299
  )
308
- if self.cv_type is not None and self.cv_type.is_time_series():
309
- resampled_data = balance_undersample_time_series(
310
- df=self.data,
311
- id_columns=self.id_columns,
312
- date_column=next(
313
- k
314
- for k, v in self.meaning_types.items()
315
- if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
316
- ),
317
- sample_size=sample_rows,
318
- random_state=self.random_state,
319
- logger=self.logger,
320
- )
321
- else:
322
- resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
300
+ resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
323
301
  self.data = resampled_data
324
302
  self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
325
303
 
@@ -237,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
237
237
  add_date_if_missing: bool = True,
238
238
  select_features: bool = False,
239
239
  disable_force_downsampling: bool = False,
240
- id_columns: Optional[List[str]] = None,
241
240
  **kwargs,
242
241
  ):
243
242
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -278,12 +277,9 @@ class FeaturesEnricher(TransformerMixin):
278
277
  )
279
278
 
280
279
  validate_version(self.logger, self.__log_warning)
281
-
282
280
  self.search_keys = search_keys or {}
283
- self.id_columns = id_columns
284
281
  self.country_code = country_code
285
282
  self.__validate_search_keys(search_keys, search_id)
286
-
287
283
  self.model_task_type = model_task_type
288
284
  self.endpoint = endpoint
289
285
  self._search_task: Optional[SearchTask] = None
@@ -932,9 +928,6 @@ class FeaturesEnricher(TransformerMixin):
932
928
  cat_features, search_keys_for_metrics = self._get_client_cat_features(
933
929
  estimator, validated_X, self.search_keys
934
930
  )
935
- search_keys_for_metrics.extend(
936
- [c for c in self.__get_renamed_id_columns() or [] if c not in search_keys_for_metrics]
937
- )
938
931
 
939
932
  prepared_data = self._prepare_data_for_metrics(
940
933
  trace_id=trace_id,
@@ -990,7 +983,7 @@ class FeaturesEnricher(TransformerMixin):
990
983
  with Spinner():
991
984
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
992
985
 
993
- has_date = self._get_date_column(search_keys) is not None
986
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
994
987
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
995
988
 
996
989
  wrapper = EstimatorWrapper.create(
@@ -1192,7 +1185,7 @@ class FeaturesEnricher(TransformerMixin):
1192
1185
  )
1193
1186
 
1194
1187
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1195
- date_column = self._get_date_column(search_keys)
1188
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1196
1189
  if (
1197
1190
  uplift_col in metrics_df.columns
1198
1191
  and (metrics_df[uplift_col] < 0).any()
@@ -1361,7 +1354,7 @@ class FeaturesEnricher(TransformerMixin):
1361
1354
  groups = None
1362
1355
 
1363
1356
  if not isinstance(_cv, BaseCrossValidator):
1364
- date_column = self._get_date_column(search_keys)
1357
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1365
1358
  date_series = X[date_column] if date_column is not None else None
1366
1359
  _cv, groups = CVConfig(
1367
1360
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1450,11 +1443,9 @@ class FeaturesEnricher(TransformerMixin):
1450
1443
 
1451
1444
  excluding_search_keys = list(search_keys.keys())
1452
1445
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1453
- excluded = set()
1454
1446
  for sk in excluding_search_keys:
1455
1447
  if columns_renaming.get(sk) in search_keys_for_metrics:
1456
- excluded.add(sk)
1457
- excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
1448
+ excluding_search_keys.remove(sk)
1458
1449
 
1459
1450
  client_features = [
1460
1451
  c
@@ -1676,7 +1667,7 @@ class FeaturesEnricher(TransformerMixin):
1676
1667
  search_keys = self.search_keys.copy()
1677
1668
  search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1678
1669
 
1679
- date_column = self._get_date_column(search_keys)
1670
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1680
1671
  generated_features = []
1681
1672
  if date_column is not None:
1682
1673
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
@@ -1750,7 +1741,7 @@ class FeaturesEnricher(TransformerMixin):
1750
1741
  search_keys = self.fit_search_keys
1751
1742
 
1752
1743
  rows_to_drop = None
1753
- has_date = self._get_date_column(search_keys) is not None
1744
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1754
1745
  self.model_task_type = self.model_task_type or define_task(
1755
1746
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1756
1747
  )
@@ -1862,10 +1853,7 @@ class FeaturesEnricher(TransformerMixin):
1862
1853
  df = balance_undersample_forced(
1863
1854
  df=df,
1864
1855
  target_column=TARGET,
1865
- id_columns=self.id_columns,
1866
- date_column=self._get_date_column(self.search_keys),
1867
1856
  task_type=self.model_task_type,
1868
- cv_type=self.cv,
1869
1857
  random_state=self.random_state,
1870
1858
  sample_size=Dataset.FORCE_SAMPLE_SIZE,
1871
1859
  logger=self.logger,
@@ -2007,7 +1995,7 @@ class FeaturesEnricher(TransformerMixin):
2007
1995
  trace_id = trace_id or uuid.uuid4()
2008
1996
  return search_task.get_progress(trace_id)
2009
1997
 
2010
- def get_transactional_transform_api(self):
1998
+ def get_transactional_transform_api(self, only_online_sources=False):
2011
1999
  if self.api_key is None:
2012
2000
  raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
2013
2001
  if self._search_task is None:
@@ -2065,7 +2053,7 @@ class FeaturesEnricher(TransformerMixin):
2065
2053
  api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
2066
2054
  -H 'Authorization: {self.api_key}' \\
2067
2055
  -H 'Content-Type: application/json' \\
2068
- -d '{{"search_keys": {keys}{features_section}}}'"""
2056
+ -d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
2069
2057
  return api_example
2070
2058
 
2071
2059
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -2109,13 +2097,15 @@ class FeaturesEnricher(TransformerMixin):
2109
2097
  return None, {c: c for c in X.columns}, []
2110
2098
 
2111
2099
  features_meta = self._search_task.get_all_features_metadata_v2()
2112
- online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
2100
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
2113
2101
  if len(online_api_features) > 0:
2114
2102
  self.logger.warning(
2115
2103
  f"There are important features for transform, that generated by online API: {online_api_features}"
2116
2104
  )
2117
- # TODO
2118
- raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2105
+ msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2106
+ self.logger.warning(msg)
2107
+ print(msg)
2108
+ print(self.get_transactional_transform_api(only_online_sources=True))
2119
2109
 
2120
2110
  if not metrics_calculation:
2121
2111
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -2165,7 +2155,7 @@ class FeaturesEnricher(TransformerMixin):
2165
2155
  df = self.__add_country_code(df, search_keys)
2166
2156
 
2167
2157
  generated_features = []
2168
- date_column = self._get_date_column(search_keys)
2158
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2169
2159
  if date_column is not None:
2170
2160
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2171
2161
  df = converter.convert(df, keep_time=True)
@@ -2173,7 +2163,7 @@ class FeaturesEnricher(TransformerMixin):
2173
2163
  generated_features.extend(converter.generated_features)
2174
2164
  else:
2175
2165
  self.logger.info("Input dataset hasn't date column")
2176
- if self.__should_add_date_column():
2166
+ if self.add_date_if_missing:
2177
2167
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2178
2168
 
2179
2169
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -2304,7 +2294,6 @@ class FeaturesEnricher(TransformerMixin):
2304
2294
  meaning_types=meaning_types,
2305
2295
  search_keys=combined_search_keys,
2306
2296
  unnest_search_keys=unnest_search_keys,
2307
- id_columns=self.__get_renamed_id_columns(),
2308
2297
  date_format=self.date_format,
2309
2298
  rest_client=self.rest_client,
2310
2299
  logger=self.logger,
@@ -2457,14 +2446,7 @@ class FeaturesEnricher(TransformerMixin):
2457
2446
  # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2458
2447
  multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2459
2448
  for multi_key in multi_keys:
2460
- if multi_key not in [
2461
- SearchKey.PHONE,
2462
- SearchKey.IP,
2463
- SearchKey.POSTAL_CODE,
2464
- SearchKey.EMAIL,
2465
- SearchKey.HEM,
2466
- SearchKey.CUSTOM_KEY,
2467
- ]:
2449
+ if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2468
2450
  msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2469
2451
  self.logger.warning(msg)
2470
2452
  raise ValidationError(msg)
@@ -2628,7 +2610,7 @@ class FeaturesEnricher(TransformerMixin):
2628
2610
  self.fit_generated_features.extend(converter.generated_features)
2629
2611
  else:
2630
2612
  self.logger.info("Input dataset hasn't date column")
2631
- if self.__should_add_date_column():
2613
+ if self.add_date_if_missing:
2632
2614
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2633
2615
 
2634
2616
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
@@ -2661,12 +2643,6 @@ class FeaturesEnricher(TransformerMixin):
2661
2643
 
2662
2644
  self.__adjust_cv(df)
2663
2645
 
2664
- if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2665
- id_columns = self.__get_renamed_id_columns()
2666
- if id_columns:
2667
- self.fit_search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
2668
- self.runtime_parameters.properties["id_columns"] = ",".join(id_columns)
2669
-
2670
2646
  df, fintech_warnings = remove_fintech_duplicates(
2671
2647
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2672
2648
  )
@@ -2696,6 +2672,7 @@ class FeaturesEnricher(TransformerMixin):
2696
2672
  self.fit_search_keys,
2697
2673
  self.fit_columns_renaming,
2698
2674
  list(unnest_search_keys.keys()),
2675
+ self.bundle,
2699
2676
  self.logger,
2700
2677
  )
2701
2678
  df = converter.convert(df)
@@ -2788,8 +2765,6 @@ class FeaturesEnricher(TransformerMixin):
2788
2765
  search_keys=combined_search_keys,
2789
2766
  unnest_search_keys=unnest_search_keys,
2790
2767
  model_task_type=self.model_task_type,
2791
- cv_type=self.cv,
2792
- id_columns=self.__get_renamed_id_columns(),
2793
2768
  date_format=self.date_format,
2794
2769
  random_state=self.random_state,
2795
2770
  rest_client=self.rest_client,
@@ -2946,13 +2921,6 @@ class FeaturesEnricher(TransformerMixin):
2946
2921
  if not self.warning_counter.has_warnings():
2947
2922
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
2948
2923
 
2949
- def __should_add_date_column(self):
2950
- return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
2951
-
2952
- def __get_renamed_id_columns(self):
2953
- reverse_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2954
- return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
2955
-
2956
2924
  def __adjust_cv(self, df: pd.DataFrame):
2957
2925
  date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2958
2926
  # Check Multivariate time series
@@ -3198,7 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
3198
3166
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3199
3167
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3200
3168
  else:
3201
- date_column = FeaturesEnricher._get_date_column(search_keys)
3169
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3202
3170
  sort_columns = [date_column] if date_column is not None else []
3203
3171
 
3204
3172
  # Xy = pd.concat([X, y], axis=1)
@@ -3261,6 +3229,7 @@ class FeaturesEnricher(TransformerMixin):
3261
3229
  f"Generate features: {self.generate_features}\n"
3262
3230
  f"Round embeddings: {self.round_embeddings}\n"
3263
3231
  f"Detect missing search keys: {self.detect_missing_search_keys}\n"
3232
+ f"Exclude columns: {self.exclude_columns}\n"
3264
3233
  f"Exclude features sources: {exclude_features_sources}\n"
3265
3234
  f"Calculate metrics: {calculate_metrics}\n"
3266
3235
  f"Scoring: {scoring}\n"
@@ -3268,6 +3237,15 @@ class FeaturesEnricher(TransformerMixin):
3268
3237
  f"Remove target outliers: {remove_outliers_calc_metrics}\n"
3269
3238
  f"Exclude columns: {self.exclude_columns}\n"
3270
3239
  f"Search id: {self.search_id}\n"
3240
+ f"Custom loss: {self.loss}\n"
3241
+ f"Logs enabled: {self.logs_enabled}\n"
3242
+ f"Raise validation error: {self.raise_validation_error}\n"
3243
+ f"Baseline score column: {self.baseline_score_column}\n"
3244
+ f"Client ip: {self.client_ip}\n"
3245
+ f"Client visitorId: {self.client_visitorid}\n"
3246
+ f"Add date if missing: {self.add_date_if_missing}\n"
3247
+ f"Select features: {self.select_features}\n"
3248
+ f"Disable force downsampling: {self.disable_force_downsampling}\n"
3271
3249
  )
3272
3250
 
3273
3251
  def sample(df):
@@ -3390,10 +3368,6 @@ class FeaturesEnricher(TransformerMixin):
3390
3368
  if t == SearchKey.POSTAL_CODE:
3391
3369
  return col
3392
3370
 
3393
- @staticmethod
3394
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3395
- return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3396
-
3397
3371
  def _explode_multiple_search_keys(
3398
3372
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
3399
3373
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
@@ -3402,9 +3376,7 @@ class FeaturesEnricher(TransformerMixin):
3402
3376
  for key_name, key_type in search_keys.items():
3403
3377
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3404
3378
  search_key_names_by_type = {
3405
- key_type: key_names
3406
- for key_type, key_names in search_key_names_by_type.items()
3407
- if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
3379
+ key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
3408
3380
  }
3409
3381
  if len(search_key_names_by_type) == 0:
3410
3382
  return df, {}
@@ -3457,9 +3429,9 @@ class FeaturesEnricher(TransformerMixin):
3457
3429
  ]
3458
3430
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3459
3431
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3460
- sort_exclude_columns.append(self._get_date_column(search_keys))
3432
+ sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3461
3433
  else:
3462
- date_column = self._get_date_column(search_keys)
3434
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3463
3435
  sort_columns = [date_column] if date_column is not None else []
3464
3436
 
3465
3437
  sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
@@ -3895,6 +3867,11 @@ class FeaturesEnricher(TransformerMixin):
3895
3867
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
3896
3868
  raise ValidationError(msg)
3897
3869
 
3870
+ if SearchKey.CUSTOM_KEY in valid_search_keys.values():
3871
+ custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
3872
+ for key in custom_keys:
3873
+ del valid_search_keys[key]
3874
+
3898
3875
  if (
3899
3876
  len(valid_search_keys.values()) == 1
3900
3877
  and self.country_code is None
@@ -350,6 +350,3 @@ class CVType(Enum):
350
350
  time_series = "time_series"
351
351
  blocked_time_series = "blocked_time_series"
352
352
  not_set = "not_set"
353
-
354
- def is_time_series(self) -> bool:
355
- return self in [CVType.time_series, CVType.blocked_time_series]
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
  forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
219
+ online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
219
220
 
220
221
  # Validation table
221
222
  validation_column_name_header=Column name
@@ -1,18 +1,15 @@
1
- import itertools
2
1
  import logging
3
- from typing import Callable, List, Optional, Union
2
+ from typing import Callable, Optional, Union
4
3
 
5
4
  import numpy as np
6
5
  import pandas as pd
7
6
  from pandas.api.types import is_numeric_dtype, is_bool_dtype
8
7
 
9
8
  from upgini.errors import ValidationError
10
- from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
9
+ from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
11
10
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
12
11
  from upgini.sampler.random_under_sampler import RandomUnderSampler
13
12
 
14
- TS_MIN_DIFFERENT_IDS_RATIO = 0.2
15
-
16
13
 
17
14
  def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
18
15
  if isinstance(y, pd.Series):
@@ -204,10 +201,7 @@ def balance_undersample(
204
201
  def balance_undersample_forced(
205
202
  df: pd.DataFrame,
206
203
  target_column: str,
207
- id_columns: List[str],
208
- date_column: str,
209
204
  task_type: ModelTaskType,
210
- cv_type: CVType | None,
211
205
  random_state: int,
212
206
  sample_size: int = 7000,
213
207
  logger: Optional[logging.Logger] = None,
@@ -239,17 +233,7 @@ def balance_undersample_forced(
239
233
 
240
234
  resampled_data = df
241
235
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
242
- if cv_type is not None and cv_type.is_time_series():
243
- logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
244
- resampled_data = balance_undersample_time_series(
245
- df,
246
- id_columns=id_columns,
247
- date_column=date_column,
248
- sample_size=sample_size,
249
- random_state=random_state,
250
- logger=logger,
251
- )
252
- elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
236
+ if task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION, ModelTaskType.TIMESERIES]:
253
237
  logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
254
238
  resampled_data = df.sample(n=sample_size, random_state=random_state)
255
239
  else:
@@ -280,65 +264,6 @@ def balance_undersample_forced(
280
264
  return resampled_data
281
265
 
282
266
 
283
- def balance_undersample_time_series(
284
- df: pd.DataFrame,
285
- id_columns: List[str],
286
- date_column: str,
287
- sample_size: int,
288
- random_state: int = 42,
289
- min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
290
- prefer_recent_dates: bool = True,
291
- logger: Optional[logging.Logger] = None,
292
- ):
293
- def ensure_tuple(x):
294
- return tuple([x]) if not isinstance(x, tuple) else x
295
-
296
- random_state = np.random.RandomState(random_state)
297
-
298
- if not id_columns:
299
- id_columns = [date_column]
300
- ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
301
- ids_sort = {
302
- ensure_tuple(k): (
303
- (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
304
- )
305
- for k, v in ids_sort.items()
306
- }
307
- id_counts = df[id_columns].value_counts()
308
- id_counts.index = [ensure_tuple(i) for i in id_counts.index]
309
- id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
310
- id_counts = id_counts[id_counts <= sample_size]
311
- min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
312
-
313
- def id_mask(sample_index: pd.Index) -> pd.Index:
314
- if isinstance(sample_index, pd.MultiIndex):
315
- return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
316
- else:
317
- return df[id_columns[0]].isin(sample_index)
318
-
319
- if len(id_counts) < min_different_ids:
320
- if logger is not None:
321
- logger.info(
322
- f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
323
- )
324
- date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
325
- ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
326
- mask = id_mask(ids_to_sample)
327
- df = df[mask]
328
- sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
329
- sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
330
- df = df[df[date_column].isin(sample_date_counts.index)]
331
- else:
332
- if len(id_columns) > 1:
333
- id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
334
- else:
335
- id_counts.index = [i[0] for i in id_counts.index]
336
- mask = id_mask(id_counts.index)
337
- df = df[mask]
338
-
339
- return df
340
-
341
-
342
267
  def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
343
268
  try:
344
269
  df = pd.concat([expected, actual])
@@ -1 +0,0 @@
1
- __version__ = "1.2.38a3769.dev8"
File without changes
File without changes
File without changes