upgini 1.2.123__py3-none-any.whl → 1.2.125__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.123"
+ __version__ = "1.2.125"
upgini/autofe/binary.py CHANGED
@@ -1,5 +1,6 @@
  import abc
  from typing import Optional
+
  import Levenshtein
  import numpy as np
  import pandas as pd
@@ -201,7 +202,7 @@ class JaroWinklerSim1(StringSim):
      has_symmetry_importance: bool = True

      def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-         return value
+         return value if value is not None and len(value) > 0 else None

      def _similarity(self, left: str, right: str) -> float:
          return jarowinkler_similarity(left, right)
@@ -216,7 +217,7 @@ class JaroWinklerSim2(StringSim):
      has_symmetry_importance: bool = True

      def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-         return value[::-1] if value is not None else None
+         return value[::-1] if value is not None and len(value) > 0 else None

      def _similarity(self, left: str, right: str) -> float:
          return jarowinkler_similarity(left, right)
@@ -231,7 +232,7 @@ class LevenshteinSim(StringSim):
      has_symmetry_importance: bool = True

      def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-         return value
+         return value if value is not None and len(value) > 0 else None

      def _similarity(self, left: str, right: str) -> float:
          return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
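The guard added to all three `_prepare_value` implementations maps empty strings to None before any similarity is computed. A minimal sketch of the failure mode it avoids, assuming the `Levenshtein` package imported above:

    import Levenshtein

    def levenshtein_sim(left: str, right: str) -> float:
        # Same formula as LevenshteinSim._similarity above
        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

    levenshtein_sim("book", "back")  # 0.5
    levenshtein_sim("", "")          # ZeroDivisionError: max(0, 0) == 0

With empty values normalized to None in `_prepare_value`, the operators can treat them as missing instead of raising, and the JaroWinkler variants likewise avoid degenerate empty-string comparisons.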
upgini/dataset.py CHANGED
@@ -151,7 +151,9 @@ class Dataset:
      def etalon_def_checked(self) -> Dict[str, str]:
          if self.etalon_def is None:
              self.etalon_def = {
-                 v.value: k for k, v in self.meaning_types_checked.items() if v != FileColumnMeaningType.FEATURE
+                 v.value: k
+                 for k, v in self.meaning_types_checked.items()
+                 if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
              }

          return self.etalon_def
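For illustration, a sketch of how the widened exclusion behaves; the enum members come from the upgini/metadata.py hunk further down, while the column names are hypothetical:

    from upgini.metadata import FileColumnMeaningType

    meaning_types = {
        "country": FileColumnMeaningType.COUNTRY,
        "label": FileColumnMeaningType.TARGET,
        "age": FileColumnMeaningType.FEATURE,
        "last_seen": FileColumnMeaningType.DATE_FEATURE,  # new in 1.2.125
    }
    etalon_def = {
        v.value: k
        for k, v in meaning_types.items()
        if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
    }
    # {'COUNTRY': 'country', 'TARGET': 'label'} - feature columns of both kinds stay out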
upgini/features_enricher.py CHANGED
@@ -76,7 +76,7 @@ from upgini.utils.custom_loss_utils import (
  )
  from upgini.utils.cv_utils import CVConfig, get_groups
  from upgini.utils.datetime_utils import (
-     DateTimeSearchKeyConverter,
+     DateTimeConverter,
      is_blocked_time_series,
      is_dates_distribution_valid,
      is_time_series,
@@ -220,7 +220,9 @@ class FeaturesEnricher(TransformerMixin):
          cv: CVType | None = None,
          loss: str | None = None,
          autodetect_search_keys: bool = True,
+         # deprecated, use text_features instead
          generate_features: list[str] | None = None,
+         text_features: list[str] | None = None,
          columns_for_online_api: list[str] | None = None,
          round_embeddings: int | None = None,
          logs_enabled: bool = True,
@@ -284,8 +286,7 @@ class FeaturesEnricher(TransformerMixin):
          self.country_code = country_code
          self.__validate_search_keys(search_keys, search_id)

-         if model_task_type is not None:
-             self.model_task_type = ModelTaskType.parse(model_task_type)
+         self.model_task_type = ModelTaskType.parse(model_task_type)
          self.endpoint = endpoint
          self._search_task: SearchTask | None = None
          self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
@@ -306,10 +307,8 @@ class FeaturesEnricher(TransformerMixin):
              search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

              print(self.bundle.get("search_by_task_id_start"))
-             trace_id = str(uuid.uuid4())
-             if self.print_trace_id:
-                 print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
-             with MDC(trace_id=trace_id):
+             trace_id = time.time_ns()
+             with MDC(correlation_id=trace_id):
                  try:
                      self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
                      self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
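This hunk and the ones that follow replace UUID4 trace ids with nanosecond timestamps and rebind the logging context key from `trace_id` to `correlation_id`. A minimal sketch of the new pattern; the import path is an assumption based on the upgini/mdc module listed in RECORD below:

    import time
    from upgini.mdc import MDC  # assumed import path

    trace_id = time.time_ns()  # e.g. 1718000000123456789; numeric and time-ordered
    with MDC(correlation_id=trace_id):
        ...  # log records emitted in this block carry the correlation_id field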
@@ -343,14 +342,14 @@ class FeaturesEnricher(TransformerMixin):
          self.shared_datasets = shared_datasets
          if shared_datasets is not None:
              self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
-         self.generate_features = generate_features
+         self.generate_features = text_features or generate_features
          self.round_embeddings = round_embeddings
-         if generate_features is not None:
-             if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
+         if self.generate_features is not None:
+             if len(self.generate_features) > self.GENERATE_FEATURES_LIMIT:
                  msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
                  self.logger.error(msg)
                  raise ValidationError(msg)
-             self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
+             self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
          if round_embeddings is not None:
              if not isinstance(round_embeddings, int) or round_embeddings < 0:
                  msg = self.bundle.get("invalid_round_embeddings")
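From user code the new argument is a drop-in rename, and per the `text_features or generate_features` fallback above, `text_features` wins when both are supplied. A hedged usage sketch with hypothetical column names:

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"signup_date": SearchKey.DATE},
        text_features=["product_description"],  # formerly generate_features=[...]
    )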
@@ -485,9 +484,9 @@ class FeaturesEnricher(TransformerMixin):
          stability_agg_func: str, optional (default="max")
              Function to aggregate stability values. Can be "max", "min", "mean".
          """
-         trace_id = str(uuid.uuid4())
+         trace_id = time.time_ns()
          if self.print_trace_id:
-             print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+             print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
          start_time = time.time()
          auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
          search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
@@ -499,7 +498,7 @@ class FeaturesEnricher(TransformerMixin):
              progress_bar.progress = search_progress.to_progress_bar()
              progress_bar.display()

-         with MDC(trace_id=trace_id):
+         with MDC(correlation_id=trace_id):
              if len(args) > 0:
                  msg = f"WARNING: Unsupported positional arguments for fit: {args}"
                  self.logger.warning(msg)
@@ -644,11 +643,11 @@ class FeaturesEnricher(TransformerMixin):

          self.warning_counter.reset()
          auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
-         trace_id = str(uuid.uuid4())
+         trace_id = time.time_ns()
          if self.print_trace_id:
-             print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+             print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
          start_time = time.time()
-         with MDC(trace_id=trace_id):
+         with MDC(correlation_id=trace_id):
              if len(args) > 0:
                  msg = f"WARNING: Unsupported positional arguments for fit_transform: {args}"
                  self.logger.warning(msg)
@@ -746,8 +745,8 @@ class FeaturesEnricher(TransformerMixin):
      def transform(
          self,
          X: pd.DataFrame,
-         *args,
          y: pd.Series | None = None,
+         *args,
          exclude_features_sources: list[str] | None = None,
          keep_input: bool = True,
          trace_id: str | None = None,
@@ -788,9 +787,11 @@ class FeaturesEnricher(TransformerMixin):
              progress_bar.progress = search_progress.to_progress_bar()
              if new_progress:
                  progress_bar.display()
-         trace_id = trace_id or str(uuid.uuid4())
+         trace_id = trace_id or time.time_ns()
+         if self.print_trace_id:
+             print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
          search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-         with MDC(trace_id=trace_id, search_id=search_id):
+         with MDC(correlation_id=trace_id, search_id=search_id):
              self.dump_input(trace_id, X)
              if len(args) > 0:
                  msg = f"WARNING: Unsupported positional arguments for transform: {args}"
@@ -905,10 +906,10 @@ class FeaturesEnricher(TransformerMixin):
              Dataframe with metrics calculated on train and validation datasets.
          """

-         trace_id = trace_id or str(uuid.uuid4())
+         trace_id = trace_id or time.time_ns()
          start_time = time.time()
          search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-         with MDC(trace_id=trace_id, search_id=search_id):
+         with MDC(correlation_id=trace_id, search_id=search_id):
              self.logger.info("Start calculate metrics")
              if len(args) > 0:
                  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
@@ -1416,13 +1417,11 @@ class FeaturesEnricher(TransformerMixin):
          # Find latest eval set or earliest if all eval sets are before train set
          date_column = self._get_date_column(search_keys)

-         date_converter = DateTimeSearchKeyConverter(
+         date_converter = DateTimeConverter(
              date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
          )

-         X = date_converter.convert(X)
-
-         x_date = X[date_column].dropna()
+         x_date = date_converter.to_date_ms(X).dropna()
          if len(x_date) == 0:
              self.logger.warning("Empty date column in X")
              return []
@@ -1435,8 +1434,7 @@ class FeaturesEnricher(TransformerMixin):
              if date_column not in eval_x.columns:
                  self.logger.warning(f"Date column not found in eval_set {i + 1}")
                  continue
-             eval_x = date_converter.convert(eval_x)
-             eval_x_date = eval_x[date_column].dropna()
+             eval_x_date = date_converter.to_date_ms(eval_x).dropna()
              if len(eval_x_date) < 1000:
                  self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
                  continue
@@ -1473,8 +1471,7 @@ class FeaturesEnricher(TransformerMixin):
          )
          checking_eval_set_df = checking_eval_set_df.copy()

-         checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-         checking_eval_set_df = date_converter.convert(checking_eval_set_df)
+         checking_eval_set_df[date_column] = date_converter.to_date_ms(eval_set_dates[selected_eval_set_idx].to_frame())

          psi_values_sparse = calculate_sparsity_psi(
              checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -1482,7 +1479,11 @@ class FeaturesEnricher(TransformerMixin):

          self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")

-         unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+         unstable_by_sparsity = [
+             feature
+             for feature, psi in psi_values_sparse.items()
+             if psi > stability_threshold
+         ]
          if unstable_by_sparsity:
              self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")

@@ -1492,7 +1493,11 @@ class FeaturesEnricher(TransformerMixin):

          self.logger.info(f"PSI values by value: {psi_values}")

-         unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+         unstable_by_value = [
+             feature
+             for feature, psi in psi_values.items()
+             if psi > stability_threshold
+         ]
          if unstable_by_value:
              self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")

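Both reflowed comprehensions implement the same PSI threshold filter; a self-contained illustration with made-up PSI values:

    psi_values = {"feature_a": 0.05, "feature_b": 0.31, "feature_c": 0.19}
    stability_threshold = 0.2
    unstable_by_value = [
        feature
        for feature, psi in psi_values.items()
        if psi > stability_threshold
    ]
    # ['feature_b'] - only features whose PSI exceeds the threshold are flagged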
@@ -1746,9 +1751,11 @@ class FeaturesEnricher(TransformerMixin):
              not in (
                  excluding_search_keys
                  + list(self.fit_dropped_features)
-                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
+                 + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
              )
          ]
+         if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
+             client_features.append(self.baseline_score_column)
          self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

          selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
@@ -1996,7 +2003,7 @@ class FeaturesEnricher(TransformerMixin):
          date_column = self._get_date_column(search_keys)
          generated_features = []
          if date_column is not None:
-             converter = DateTimeSearchKeyConverter(
+             converter = DateTimeConverter(
                  date_column,
                  self.date_format,
                  self.logger,
@@ -2005,6 +2012,7 @@ class FeaturesEnricher(TransformerMixin):
              )
              # Leave original date column values
              df_with_date_features = converter.convert(df, keep_time=True)
+             # TODO check if this is correct
              df_with_date_features[date_column] = df[date_column]
              df = df_with_date_features
              generated_features = converter.generated_features
@@ -2035,15 +2043,17 @@ class FeaturesEnricher(TransformerMixin):

          # Sample after sorting by system_record_id for idempotency
          df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-         df = self.__downsample_for_metrics(df)

-         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+         if DateTimeConverter.DATETIME_COL in df.columns:
+             df = df.drop(columns=DateTimeConverter.DATETIME_COL)

          df = df.rename(columns=columns_renaming)
          generated_features = [columns_renaming.get(c, c) for c in generated_features]
          search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}

+         # It uses original columns names!
+         df = self.__downsample_for_metrics(df)
+
          train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
          X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
          y_sampled = train_df[TARGET].copy()
@@ -2387,7 +2397,7 @@ class FeaturesEnricher(TransformerMixin):
      def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
          search_task = search_task or self._search_task
          if search_task is not None:
-             trace_id = trace_id or uuid.uuid4()
+             trace_id = trace_id or time.time_ns()
              return search_task.get_progress(trace_id)

      def display_transactional_transform_api(self, only_online_sources=False):
@@ -2415,7 +2425,7 @@ class FeaturesEnricher(TransformerMixin):
                  return "12345678"
              return "test_value"

-         file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
+         file_metadata = self._search_task.get_file_metadata(time.time_ns())

          def get_column_meta(column_name: str) -> FileColumnMetadata:
              for c in file_metadata.columns:
@@ -2509,7 +2519,7 @@ if response.status_code == 200:

          start_time = time.time()
          search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-         with MDC(trace_id=trace_id, search_id=search_id):
+         with MDC(correlation_id=trace_id, search_id=search_id):
              self.logger.info("Start transform")

              validated_X, validated_y, validated_eval_set = self._validate_train_eval(
@@ -2598,7 +2608,7 @@ if response.status_code == 200:
              generated_features = []
              date_column = self._get_date_column(search_keys)
              if date_column is not None:
-                 converter = DateTimeSearchKeyConverter(
+                 converter = DateTimeConverter(
                      date_column,
                      self.date_format,
                      self.logger,
@@ -2655,8 +2665,8 @@ if response.status_code == 200:

              # Don't pass all features in backend on transform
              runtime_parameters = self._get_copy_of_runtime_parameters()
-             features_for_transform = self._search_task.get_features_for_transform() or []
-             if len(features_for_transform) > 0:
+             features_for_transform = self._search_task.get_features_for_transform()
+             if features_for_transform:
                  missing_features_for_transform = [
                      columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
                  ]
@@ -2667,7 +2677,10 @@ if response.status_code == 200:
                      raise ValidationError(
                          self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
                      )
-                 runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
+             features_for_embeddings = self._search_task.get_features_for_embeddings()
+             if features_for_embeddings:
+                 runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_embeddings)
+             features_for_transform = [f for f in features_for_transform if f not in search_keys.keys()]

              columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)

@@ -2728,8 +2741,17 @@ if response.status_code == 200:
                  )
                  df = converter.convert(df)

+             date_features = []
+             for col in features_for_transform:
+                 if DateTimeConverter(col).is_datetime(df):
+                     df[col] = DateTimeConverter(col).to_date_string(df)
+                     date_features.append(col)
+
              meaning_types = {}
-             meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+             meaning_types.update(
+                 {col: FileColumnMeaningType.FEATURE for col in features_for_transform if col not in date_features}
+             )
+             meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
              meaning_types.update({col: key.value for col, key in search_keys.items()})

              features_not_to_pass.extend(
@@ -2742,8 +2764,8 @@ if response.status_code == 200:
                  ]
              )

-             if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-                 df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+             if DateTimeConverter.DATETIME_COL in df.columns:
+                 df = df.drop(columns=DateTimeConverter.DATETIME_COL)

              # search keys might be changed after explode
              columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
@@ -2925,6 +2947,7 @@ if response.status_code == 200:
                  or c in self.search_keys
                  or c in (self.id_columns or [])
                  or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+                 or c == self.baseline_score_column
              ]
          else:
              selected_input_columns = []
@@ -3123,7 +3146,7 @@ if response.status_code == 200:
          self.fit_generated_features = []

          if has_date:
-             converter = DateTimeSearchKeyConverter(
+             converter = DateTimeConverter(
                  maybe_date_column,
                  self.date_format,
                  self.logger,
@@ -3176,8 +3199,8 @@ if response.status_code == 200:
              self.TARGET_NAME,
              EVAL_SET_INDEX,
          ] + list(self.fit_search_keys.keys())
-         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-             non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+         if DateTimeConverter.DATETIME_COL in df.columns:
+             non_feature_columns.append(DateTimeConverter.DATETIME_COL)

          features_columns = [c for c in df.columns if c not in non_feature_columns]

@@ -3264,15 +3287,27 @@ if response.status_code == 200:
              ENTITY_SYSTEM_RECORD_ID,
              SEARCH_KEY_UNNEST,
          ] + list(self.fit_search_keys.keys())
-         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-             non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+         if DateTimeConverter.DATETIME_COL in df.columns:
+             non_feature_columns.append(DateTimeConverter.DATETIME_COL)

          features_columns = [c for c in df.columns if c not in non_feature_columns]

+         # find date features
+         date_features = []
+         for col in features_columns:
+             if DateTimeConverter(col).is_datetime(df):
+                 df[col] = DateTimeConverter(col).to_date_string(df)
+                 date_features.append(col)
+
          meaning_types = {
              **{col: key.value for col, key in self.fit_search_keys.items()},
-             **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
+             **{
+                 str(c): FileColumnMeaningType.FEATURE
+                 for c in df.columns
+                 if c not in non_feature_columns and c not in date_features
+             },
          }
+         meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
          meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
          meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
          if SEARCH_KEY_UNNEST in df.columns:
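With this hunk the detect-and-serialize pass runs during fit as well as transform: any feature column that parses as a datetime is rewritten as a "%Y-%m-%d" string and tagged DATE_FEATURE rather than FEATURE. A sketch against a hypothetical frame:

    import pandas as pd
    from upgini.metadata import FileColumnMeaningType
    from upgini.utils.datetime_utils import DateTimeConverter

    df = pd.DataFrame({
        "last_login": pd.to_datetime(["2024-03-01", "2024-03-05"]),
        "clicks": [3, 7],
    })
    date_features = [c for c in df.columns if DateTimeConverter(c).is_datetime(df)]
    # ['last_login'] - "clicks" is numeric but outside the supported epoch ranges
    meaning_types = {c: FileColumnMeaningType.DATE_FEATURE for c in date_features}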
@@ -3293,8 +3328,8 @@ if response.status_code == 200:
              self.bundle,
          )

-         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+         if DateTimeConverter.DATETIME_COL in df.columns:
+             df = df.drop(columns=DateTimeConverter.DATETIME_COL)

          meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID

@@ -3331,7 +3366,9 @@ if response.status_code == 200:
          dataset.columns_renaming = self.fit_columns_renaming

          self.passed_features = [
-             column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
+             column
+             for column, meaning_type in meaning_types.items()
+             if meaning_type in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
          ]

          self._search_task = dataset.search(
@@ -3859,8 +3896,8 @@ if response.status_code == 200:
          X = Xy.drop(columns=TARGET)
          y = Xy[TARGET].copy()

-         if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-             X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+         if DateTimeConverter.DATETIME_COL in X.columns:
+             X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

          return X, y

@@ -3870,8 +3907,8 @@ if response.status_code == 200:
          X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
      ) -> tuple[pd.DataFrame, pd.Series]:
          if cv not in [CVType.time_series, CVType.blocked_time_series]:
-             if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-                 date_column = DateTimeSearchKeyConverter.DATETIME_COL
+             if DateTimeConverter.DATETIME_COL in X.columns:
+                 date_column = DateTimeConverter.DATETIME_COL
              else:
                  date_column = FeaturesEnricher._get_date_column(search_keys)
              sort_columns = [date_column] if date_column is not None else []
@@ -3899,8 +3936,8 @@ if response.status_code == 200:

          y = Xy[TARGET].copy()

-         if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-             X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+         if DateTimeConverter.DATETIME_COL in X.columns:
+             X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

          return X, y

@@ -3979,12 +4016,10 @@ if response.status_code == 200:
              maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
              if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
                  # TODO cast date column to single dtype
-                 date_converter = DateTimeSearchKeyConverter(
-                     maybe_date_col, self.date_format, generate_cyclical_features=False
-                 )
-                 converted_X = date_converter.convert(X)
-                 min_date = converted_X[maybe_date_col].min()
-                 max_date = converted_X[maybe_date_col].max()
+                 date_converter = DateTimeConverter(maybe_date_col, self.date_format, generate_cyclical_features=False)
+                 date_col_values = date_converter.to_date_ms(X)
+                 min_date = date_col_values.min()
+                 max_date = date_col_values.max()
                  self.logger.info(f"Dates interval is ({min_date}, {max_date})")

          except Exception:
@@ -4021,7 +4056,7 @@ if response.status_code == 200:
              self.__log_warning(bundle.get("current_date_added"))
              df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
              search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
-             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
+             converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
              df = converter.convert(df)
          return df

@@ -4152,8 +4187,8 @@ if response.status_code == 200:
              "__target",
              ENTITY_SYSTEM_RECORD_ID,
          ]
-         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-             date_column = DateTimeSearchKeyConverter.DATETIME_COL
+         if DateTimeConverter.DATETIME_COL in df.columns:
+             date_column = DateTimeConverter.DATETIME_COL
              sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
          else:
              date_column = FeaturesEnricher._get_date_column(search_keys)
@@ -4953,7 +4988,7 @@ if response.status_code == 200:
          eval_set: tuple | None = None,
      ):
          def dump_task(X_, y_, eval_set_):
-             with MDC(trace_id=trace_id):
+             with MDC(correlation_id=trace_id):
                  try:
                      if isinstance(X_, pd.Series):
                          X_ = X_.to_frame()
upgini/metadata.py CHANGED
@@ -36,6 +36,7 @@ class FileColumnMeaningType(Enum):
      SCORE = "SCORE"
      TARGET = "TARGET"
      FEATURE = "FEATURE"
+     DATE_FEATURE = "DATE_FEATURE"
      CUSTOM_KEY = "CUSTOM_KEY"
      COUNTRY = "COUNTRY"
      POSTAL_CODE = "POSTAL_CODE"
@@ -163,7 +164,9 @@ class ModelTaskType(Enum):
          return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]

      @staticmethod
-     def parse(task_type: Any) -> "ModelTaskType":
+     def parse(task_type: Any) -> Optional["ModelTaskType"]:
+         if task_type is None:
+             return None
          if isinstance(task_type, ModelTaskType):
              return task_type
          elif isinstance(task_type, str):
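The None short-circuit is what lets the FeaturesEnricher constructor above call `ModelTaskType.parse(model_task_type)` unconditionally. Behaviour implied by the branches visible in this hunk:

    from upgini.metadata import ModelTaskType

    assert ModelTaskType.parse(None) is None  # new: no more caller-side None check
    assert ModelTaskType.parse(ModelTaskType.BINARY) is ModelTaskType.BINARY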
upgini/normalizer/normalize_utils.py CHANGED
@@ -25,7 +25,7 @@ from upgini.metadata import (
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
  from upgini.utils import find_numbers_with_decimal_comma
  from upgini.utils.country_utils import CountrySearchKeyConverter
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+ from upgini.utils.datetime_utils import DateTimeConverter
  from upgini.utils.ip_utils import IpSearchKeyConverter
  from upgini.utils.phone_utils import PhoneSearchKeyConverter
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
@@ -89,7 +89,7 @@ class Normalizer:
                  SYSTEM_RECORD_ID,
                  ENTITY_SYSTEM_RECORD_ID,
                  SEARCH_KEY_UNNEST,
-                 DateTimeSearchKeyConverter.DATETIME_COL,
+                 DateTimeConverter.DATETIME_COL,
              ]:
                  self.columns_renaming[column] = column
                  new_columns.append(column)
upgini/search_task.py CHANGED
@@ -165,10 +165,21 @@ class SearchTask:

          return list(zero_hit_search_keys)

-     def get_features_for_transform(self) -> Optional[List[str]]:
+     def get_features_for_embeddings(self) -> Optional[List[str]]:
          if self.provider_metadata_v2 is None:
              return None

+         features_for_transform = set()
+         for meta in self.provider_metadata_v2:
+             if meta.features_used_for_embeddings is not None:
+                 features_for_transform.update(meta.features_used_for_embeddings)
+
+         return list(features_for_transform)
+
+     def get_features_for_transform(self) -> List[str]:
+         if self.provider_metadata_v2 is None:
+             return []
+
          features_for_transform = set()
          for meta in self.provider_metadata_v2:
              if meta.features_used_for_embeddings is not None:
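`get_features_for_transform` now always returns a list, while the Optional contract moves to the new `get_features_for_embeddings`; that is why the transform code in features_enricher.py above can drop its `or []` guard. A sketch against a hypothetical `search_task` instance:

    features_for_transform = search_task.get_features_for_transform()
    if features_for_transform:  # plain truthiness; empty list when no metadata
        ...
    features_for_embeddings = search_task.get_features_for_embeddings()
    if features_for_embeddings:  # may be None when provider metadata is absent
        ...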
upgini/utils/datetime_utils.py CHANGED
@@ -30,7 +30,7 @@ DATE_FORMATS = [
  DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"


- class DateTimeSearchKeyConverter:
+ class DateTimeConverter:
      DATETIME_COL = "_date_time"
      # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
      MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
@@ -73,41 +73,99 @@ class DateTimeSearchKeyConverter:
          except Exception:
              return None

-     def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
-         if len(df) == 0:
-             return df
+     def is_datetime(self, df: pd.DataFrame) -> bool:
+         if len(df) == 0 or df[self.date_column].isna().all():
+             return False
+
+         if pd.api.types.is_datetime64_any_dtype(df[self.date_column]):
+             return True
+
+         parsed = self.parse_datetime(df, raise_errors=False)
+         return parsed is not None and not parsed.isna().all()

+     def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
          df = df.copy()
-         if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
-             df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
-         elif isinstance(df[self.date_column].values[0], datetime.date):
-             df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
-         elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
-             df[self.date_column] = df[self.date_column].dt.to_timestamp()
-         elif is_numeric_dtype(df[self.date_column]):
-             # 315532801 - 2524608001 - seconds
-             # 315532801000 - 2524608001000 - milliseconds
-             # 315532801000000 - 2524608001000000 - microseconds
-             # 315532801000000000 - 2524608001000000000 - nanoseconds
-             if df[self.date_column].apply(lambda x: 10**16 < x).all():
-                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
-             elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
-                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
-             elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
-                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-             elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
-                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
+         if len(df) == 0 or df[self.date_column].isna().all():
+             return None
+
+         try:
+             if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
+                 parsed_datetime = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
+             elif isinstance(df[self.date_column].dropna().values[0], datetime.date):
+                 parsed_datetime = pd.to_datetime(df[self.date_column], errors="coerce")
+             elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
+                 parsed_datetime = df[self.date_column].dt.to_timestamp()
+             elif is_numeric_dtype(df[self.date_column]):
+                 # 315532801 - 2524608001 - seconds
+                 # 315532801000 - 2524608001000 - milliseconds
+                 # 315532801000000 - 2524608001000000 - microseconds
+                 # 315532801000000000 - 2524608001000000000 - nanoseconds
+                 if df[self.date_column].apply(lambda x: 10**16 < x).all():
+                     parsed_datetime = pd.to_datetime(df[self.date_column], unit="ns")
+                 elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
+                     parsed_datetime = pd.to_datetime(df[self.date_column], unit="us")
+                 elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
+                     parsed_datetime = pd.to_datetime(df[self.date_column], unit="ms")
+                 elif df[self.date_column].apply(lambda x: 10**8 < x < 10**11).all():
+                     parsed_datetime = pd.to_datetime(df[self.date_column], unit="s")
+                 else:
+                     msg = self.bundle.get("unsupported_date_type").format(self.date_column)
+                     if raise_errors:
+                         raise ValidationError(msg)
+                     else:
+                         return None
+             else:
+                 df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
+                 parsed_datetime = self.parse_string_date(df, raise_errors)
+             parsed_datetime = parsed_datetime.dt.tz_localize(None)
+             return parsed_datetime
+         except Exception as e:
+             if raise_errors:
+                 raise ValidationError(e)
              else:
-                 msg = self.bundle.get("unsupported_date_type").format(self.date_column)
-                 raise ValidationError(msg)
+                 return None
+
+     def to_date_string(self, df: pd.DataFrame) -> pd.Series:
+         parsed_datetime = self.parse_datetime(df)
+         if parsed_datetime is None:
+             return df[self.date_column]
+         return parsed_datetime.dt.strftime("%Y-%m-%d")
+
+     def to_date_ms(self, df: pd.DataFrame) -> pd.Series:
+         parsed_datetime = self.parse_datetime(df)
+         if parsed_datetime is None:
+             return df[self.date_column]
+         return self.convert_datetime_to_date_ms(parsed_datetime)
+
+     def convert_datetime_to_datetime_ms(self, date_col: pd.Series) -> pd.Series:
+         if date_col.dt.unit == "ns":
+             date_col = date_col.astype(np.int64) // 1_000_000
+         elif date_col.dt.unit == "us":
+             date_col = date_col.astype(np.int64) // 1_000
+         elif date_col.dt.unit == "ms":
+             date_col = date_col.astype(np.int64)
+         elif date_col.dt.unit == "s":
+             date_col = date_col.astype(np.int64) * 1_000
          else:
-             df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
-             df[self.date_column] = self.parse_date(df)
+             raise ValueError(f"Unsupported date unit: {date_col.dt.unit}")
+
+         return date_col.apply(self._int_to_opt).astype("Int64")
+
+     def convert_datetime_to_date_ms(self, date_col: pd.Series) -> pd.Series:
+         date_col = date_col.dt.floor("D")
+         return self.convert_datetime_to_datetime_ms(date_col)
+
+     def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
+         df = df.copy()
+         parsed_datetime = self.parse_datetime(df)
+         if parsed_datetime is None:
+             return df
+
+         df[self.date_column] = parsed_datetime

          # If column with date is datetime then extract seconds of the day and minute of the hour
          # as additional features
          seconds = "datetime_seconds"
-         df[self.date_column] = df[self.date_column].dt.tz_localize(None)

          df = self.clean_old_dates(df)

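Taken together, these changes split the old convert-in-place logic into a reusable `parse_datetime` plus narrow views over it. A hedged usage sketch of the new surface (data hypothetical):

    import pandas as pd
    from upgini.utils.datetime_utils import DateTimeConverter

    df = pd.DataFrame({"dt": ["2024-01-15 10:30:00", "2024-02-01 08:00:00"]})
    converter = DateTimeConverter("dt")

    converter.is_datetime(df)     # True: parseable, checked without raising
    converter.to_date_string(df)  # Series of "2024-01-15", "2024-02-01"
    converter.to_date_ms(df)      # nullable Int64 epoch milliseconds, floored to day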
@@ -182,21 +240,22 @@ class DateTimeSearchKeyConverter:
              df.drop(columns=seconds, inplace=True)

          if keep_time:
-             df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
-             df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
-             df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
-             df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
+             df[self.DATETIME_COL] = self.convert_datetime_to_datetime_ms(df[self.date_column])
+             df[self.date_column] = self.convert_datetime_to_date_ms(df[self.date_column])

          self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")

          return df

-     def parse_date(self, df: pd.DataFrame):
+     def parse_string_date(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
          if self.date_format is not None:
              try:
                  return pd.to_datetime(df[self.date_column], format=self.date_format)
              except ValueError as e:
-                 raise ValidationError(e)
+                 if raise_errors:
+                     raise ValidationError(e)
+                 else:
+                     return None
          else:
              for date_format in DATE_FORMATS:
                  try:
@@ -204,9 +263,17 @@ class DateTimeSearchKeyConverter:
                  except ValueError:
                      pass
              try:
-                 return pd.to_datetime(df[self.date_column])
+                 # Suppress warning for intentional fallback to dateutil parsing
+                 import warnings
+
+                 with warnings.catch_warnings():
+                     warnings.filterwarnings("ignore", message="Could not infer format")
+                     return pd.to_datetime(df[self.date_column])
              except ValueError:
-                 raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                 if raise_errors:
+                     raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                 else:
+                     return None

      def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
          condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
upgini/utils/deduplicate_utils.py CHANGED
@@ -14,7 +14,7 @@ from upgini.metadata import (
      SearchKey,
  )
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+ from upgini.utils.datetime_utils import DateTimeConverter
  from upgini.utils.target_utils import define_task


@@ -104,7 +104,7 @@ def remove_fintech_duplicates(
      sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)

      # Convert date columns for further checks
-     sub_df = DateTimeSearchKeyConverter(
+     sub_df = DateTimeConverter(
          date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
      ).convert(sub_df)
      grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
upgini-1.2.123.dist-info/METADATA → upgini-1.2.125.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.123
+ Version: 1.2.125
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: ipywidgets>=8.1.0
  Requires-Dist: jarowinkler>=2.0.0
  Requires-Dist: levenshtein>=0.25.1
  Requires-Dist: lightgbm>=4.6.0
+ Requires-Dist: more-itertools==10.7.0
  Requires-Dist: numpy<3.0.0,>=1.19.0
  Requires-Dist: pandas<3.0.0,>=1.1.0
  Requires-Dist: psutil>=5.9.0
upgini-1.2.123.dist-info/RECORD → upgini-1.2.125.dist-info/RECORD RENAMED
@@ -1,20 +1,20 @@
- upgini/__about__.py,sha256=siGr3k7uTvd5-HzBPobT1R8em9DBHJhVkabyMz1HdSU,24
+ upgini/__about__.py,sha256=khvL6Ma3KHnaaXtUCPR9kKBJFG5qg7emKoKVlrbEt0k,24
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
- upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
+ upgini/dataset.py,sha256=Nm2ZmwyQqvTnymYpGUwyJWy7y2ebXlHMyYmGeGcyA_s,31652
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=sMH37pXVqhZcYKB1_cEYmiFfhJ_aGwnuk1_k-8P3eLw,231792
+ upgini/features_enricher.py,sha256=tmKeERG2b0YfJ47g-UXQQ3S-9tyagwUOhI4oqN3kG2w,233058
  upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
- upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
+ upgini/metadata.py,sha256=CL9bFytdUZlbQYtTgNgAkt_sxO9klARQtULDBgb2Hlg,12575
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
- upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
+ upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
- upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
+ upgini/autofe/binary.py,sha256=o3TQuP3EnECAVIeToGczu4yJ4vX7BJ2iSCN9Ra1SZJI,7829
  upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
  upgini/autofe/feature.py,sha256=W9sZHdz5Vi0H_oPyY5saZAPjyd5wunpULnCqrGLpQc4,16879
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
@@ -35,7 +35,7 @@ upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbx
  upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
+ upgini/normalizer/normalize_utils.py,sha256=w9f_9udrwqbhXgFMTs2keuce-6X_j6h3D7EdNo_2X7g,8493
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
  upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
@@ -52,8 +52,8 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
- upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
- upgini/utils/deduplicate_utils.py,sha256=oZEiZeN-A92zwAPysV4OP9hO-niC2RLt-Dhc_hynBTU,11273
+ upgini/utils/datetime_utils.py,sha256=l85UzSQLhtMeI2G6m-m8y8bCColCLSXNHb2-G6fKpLM,16988
+ upgini/utils/deduplicate_utils.py,sha256=6czbn1q0p-lOmrNvbAzueBpDHmfIP4TfV4poWqbjX5w,11255
  upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc,11973
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.123.dist-info/METADATA,sha256=VCwJkPdHkgS9UQC_o7zS5kS4HmOjMPrkOIBfutNLWPE,50743
- upgini-1.2.123.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- upgini-1.2.123.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.123.dist-info/RECORD,,
+ upgini-1.2.125.dist-info/METADATA,sha256=CAoP8m15syLZEVmnYuUjUMI1Jo-XvMCGhz-CZnRYwy4,50781
+ upgini-1.2.125.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ upgini-1.2.125.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.125.dist-info/RECORD,,