upgini 1.2.124.tar.gz → 1.2.127.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (82)
  1. {upgini-1.2.124 → upgini-1.2.127}/PKG-INFO +2 -1
  2. {upgini-1.2.124 → upgini-1.2.127}/pyproject.toml +1 -0
  3. upgini-1.2.127/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/binary.py +4 -3
  5. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/data_source/data_source_publisher.py +1 -9
  6. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/dataset.py +3 -1
  7. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/features_enricher.py +129 -76
  8. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/metadata.py +2 -0
  9. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/normalizer/normalize_utils.py +2 -2
  10. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/resource_bundle/strings.properties +2 -1
  11. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/search_task.py +12 -1
  12. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/datetime_utils.py +103 -36
  13. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/deduplicate_utils.py +2 -2
  14. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/display_utils.py +44 -7
  15. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/feature_info.py +18 -7
  16. upgini-1.2.124/src/upgini/__about__.py +0 -1
  17. {upgini-1.2.124 → upgini-1.2.127}/.gitignore +0 -0
  18. {upgini-1.2.124 → upgini-1.2.127}/LICENSE +0 -0
  19. {upgini-1.2.124 → upgini-1.2.127}/README.md +0 -0
  20. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/__init__.py +0 -0
  21. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/ads.py +0 -0
  22. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/ads_management/__init__.py +0 -0
  23. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/ads_management/ads_manager.py +0 -0
  24. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/__init__.py +0 -0
  25. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/all_operators.py +0 -0
  26. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/date.py +0 -0
  27. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/feature.py +0 -0
  28. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/groupby.py +0 -0
  29. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/operator.py +0 -0
  30. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/__init__.py +0 -0
  31. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/base.py +0 -0
  32. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/cross.py +0 -0
  33. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/delta.py +0 -0
  34. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/lag.py +0 -0
  35. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/roll.py +0 -0
  36. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/trend.py +0 -0
  37. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/timeseries/volatility.py +0 -0
  38. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/unary.py +0 -0
  39. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/utils.py +0 -0
  40. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/autofe/vector.py +0 -0
  41. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/data_source/__init__.py +0 -0
  42. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/errors.py +0 -0
  43. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/http.py +0 -0
  44. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/mdc/__init__.py +0 -0
  45. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/mdc/context.py +0 -0
  46. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/metrics.py +0 -0
  47. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/normalizer/__init__.py +0 -0
  48. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/resource_bundle/__init__.py +0 -0
  49. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/resource_bundle/exceptions.py +0 -0
  50. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  51. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/sampler/__init__.py +0 -0
  52. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/sampler/base.py +0 -0
  53. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/sampler/random_under_sampler.py +0 -0
  54. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/sampler/utils.py +0 -0
  55. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/spinner.py +0 -0
  56. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  57. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/__init__.py +0 -0
  58. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/base_search_key_detector.py +0 -0
  59. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/blocked_time_series.py +0 -0
  60. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/config.py +0 -0
  61. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/country_utils.py +0 -0
  62. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/custom_loss_utils.py +0 -0
  63. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/cv_utils.py +0 -0
  64. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.124 → upgini-1.2.127}/src/upgini/version_validator.py +0 -0
--- upgini-1.2.124/PKG-INFO
+++ upgini-1.2.127/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.124
+Version: 1.2.127
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: more-itertools==10.7.0
 Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
 Requires-Dist: psutil>=5.9.0
--- upgini-1.2.124/pyproject.toml
+++ upgini-1.2.127/pyproject.toml
@@ -55,6 +55,7 @@ dependencies = [
     "levenshtein>=0.25.1",
     "psutil>=5.9.0",
     "category-encoders>=2.8.1",
+    "more_itertools==10.7.0",
 ]

 [project.urls]
--- /dev/null
+++ upgini-1.2.127/src/upgini/__about__.py
@@ -0,0 +1 @@
+__version__ = "1.2.127"
--- upgini-1.2.124/src/upgini/autofe/binary.py
+++ upgini-1.2.127/src/upgini/autofe/binary.py
@@ -1,5 +1,6 @@
 import abc
 from typing import Optional
+
 import Levenshtein
 import numpy as np
 import pandas as pd
@@ -201,7 +202,7 @@ class JaroWinklerSim1(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -216,7 +217,7 @@ class JaroWinklerSim2(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
+        return value[::-1] if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -231,7 +232,7 @@ class LevenshteinSim(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
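
Note on the binary.py hunks above: the new _prepare_value guards normalize empty strings to None. This matters most for LevenshteinSim, where two empty strings make max(len(left), len(right)) zero, so the old code could raise ZeroDivisionError. A minimal sketch of the effect, assuming (as the StringSim base class presumably does) that None values are skipped before _similarity is called:

    import Levenshtein

    def similarity(left: str, right: str) -> float:
        # raises ZeroDivisionError when both strings are empty
        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

    def prepare_value(value):
        # new guard: empty or missing values are normalized to None
        return value if value is not None and len(value) > 0 else None

    assert prepare_value("") is None      # empty string now treated as missing
    assert prepare_value("abc") == "abc"  # non-empty values pass through
    # similarity("", "") would raise ZeroDivisionError without the guard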
--- upgini-1.2.124/src/upgini/data_source/data_source_publisher.py
+++ upgini-1.2.127/src/upgini/data_source/data_source_publisher.py
@@ -123,17 +123,9 @@ class DataSourcePublisher:
             set(search_keys.values()) == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
+            or snapshot_frequency_days is not None or join_date_abs_limit_days is not None
         ) and sort_column is None:
             raise ValidationError("Sort column is required for passed search keys")
-        if (
-            set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
-            and snapshot_frequency_days is None
-            and join_date_abs_limit_days is None
-        ):
-            raise ValidationError(
-                "With MSISDN and DATE keys one of the snapshot_frequency_days or"
-                " join_date_abs_limit_days parameters is required"
-            )
         if (
             set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
             or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
--- upgini-1.2.124/src/upgini/dataset.py
+++ upgini-1.2.127/src/upgini/dataset.py
@@ -151,7 +151,9 @@ class Dataset:
     def etalon_def_checked(self) -> Dict[str, str]:
         if self.etalon_def is None:
             self.etalon_def = {
-                v.value: k for k, v in self.meaning_types_checked.items() if v != FileColumnMeaningType.FEATURE
+                v.value: k
+                for k, v in self.meaning_types_checked.items()
+                if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
             }

         return self.etalon_def
--- upgini-1.2.124/src/upgini/features_enricher.py
+++ upgini-1.2.127/src/upgini/features_enricher.py
@@ -76,7 +76,7 @@ from upgini.utils.custom_loss_utils import (
 )
 from upgini.utils.cv_utils import CVConfig, get_groups
 from upgini.utils.datetime_utils import (
-    DateTimeSearchKeyConverter,
+    DateTimeConverter,
     is_blocked_time_series,
     is_dates_distribution_valid,
     is_time_series,
@@ -220,7 +220,9 @@ class FeaturesEnricher(TransformerMixin):
         cv: CVType | None = None,
         loss: str | None = None,
         autodetect_search_keys: bool = True,
+        # deprecated, use text_features instead
         generate_features: list[str] | None = None,
+        text_features: list[str] | None = None,
         columns_for_online_api: list[str] | None = None,
         round_embeddings: int | None = None,
         logs_enabled: bool = True,
@@ -305,10 +307,8 @@ class FeaturesEnricher(TransformerMixin):
            search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

            print(self.bundle.get("search_by_task_id_start"))
-           trace_id = str(uuid.uuid4())
-           if self.print_trace_id:
-               print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
-           with MDC(trace_id=trace_id):
+           trace_id = time.time_ns()
+           with MDC(correlation_id=trace_id):
                try:
                    self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
                    self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
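
Note: trace ids change from UUID strings (str(uuid.uuid4())) to integer nanosecond timestamps (time.time_ns()), and the MDC logging context now carries them as correlation_id instead of trace_id; the Datadog search links are updated to match throughout this file. A sketch of the new pattern (the URL mirrors the diff; the timestamp value is illustrative):

    import time

    trace_id = time.time_ns()  # e.g. 1718900000123456789; time-ordered, unlike a random UUID
    print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")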
@@ -342,14 +342,14 @@
         self.shared_datasets = shared_datasets
         if shared_datasets is not None:
             self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
-        self.generate_features = generate_features
+        self.generate_features = text_features or generate_features
         self.round_embeddings = round_embeddings
-        if generate_features is not None:
-            if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
+        if self.generate_features is not None:
+            if len(self.generate_features) > self.GENERATE_FEATURES_LIMIT:
                 msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
                 self.logger.error(msg)
                 raise ValidationError(msg)
-            self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
+            self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
         if round_embeddings is not None:
             if not isinstance(round_embeddings, int) or round_embeddings < 0:
                 msg = self.bundle.get("invalid_round_embeddings")
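
Note: text_features is the new name for the text-feature list; generate_features stays as a deprecated alias, and text_features wins when both are given (text_features or generate_features). A hedged usage sketch (column and key names are illustrative):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"order_date": SearchKey.DATE},
        text_features=["description"],        # preferred name as of 1.2.127
        # generate_features=["description"],  # deprecated alias, still accepted
    )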
@@ -484,9 +484,9 @@
         stability_agg_func: str, optional (default="max")
             Function to aggregate stability values. Can be "max", "min", "mean".
         """
-        trace_id = str(uuid.uuid4())
+        trace_id = time.time_ns()
         if self.print_trace_id:
-            print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         start_time = time.time()
         auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
         search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
@@ -498,7 +498,7 @@
             progress_bar.progress = search_progress.to_progress_bar()
             progress_bar.display()

-        with MDC(trace_id=trace_id):
+        with MDC(correlation_id=trace_id):
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for fit: {args}"
                 self.logger.warning(msg)
@@ -643,11 +643,11 @@

         self.warning_counter.reset()
         auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
-        trace_id = str(uuid.uuid4())
+        trace_id = time.time_ns()
         if self.print_trace_id:
-            print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         start_time = time.time()
-        with MDC(trace_id=trace_id):
+        with MDC(correlation_id=trace_id):
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for fit_transform: {args}"
                 self.logger.warning(msg)
@@ -745,8 +745,8 @@
     def transform(
         self,
         X: pd.DataFrame,
-        *args,
         y: pd.Series | None = None,
+        *args,
         exclude_features_sources: list[str] | None = None,
         keep_input: bool = True,
         trace_id: str | None = None,
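
Note: in transform, y now comes before *args, so it can be passed positionally as the second argument instead of keyword-only. A sketch of the calling-convention change (enricher, X and y assumed already defined):

    # 1.2.124: y was keyword-only
    # enriched = enricher.transform(X, y=y)

    # 1.2.127: positional y also works; the keyword form remains valid
    enriched = enricher.transform(X, y)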
@@ -787,9 +787,11 @@
             progress_bar.progress = search_progress.to_progress_bar()
             if new_progress:
                 progress_bar.display()
-        trace_id = trace_id or str(uuid.uuid4())
+        trace_id = trace_id or time.time_ns()
+        if self.print_trace_id:
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.dump_input(trace_id, X)
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for transform: {args}"
@@ -904,10 +906,10 @@
         Dataframe with metrics calculated on train and validation datasets.
         """

-        trace_id = trace_id or str(uuid.uuid4())
+        trace_id = trace_id or time.time_ns()
         start_time = time.time()
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.logger.info("Start calculate metrics")
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
@@ -1415,13 +1417,11 @@
         # Find latest eval set or earliest if all eval sets are before train set
         date_column = self._get_date_column(search_keys)

-        date_converter = DateTimeSearchKeyConverter(
+        date_converter = DateTimeConverter(
             date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
         )

-        X = date_converter.convert(X)
-
-        x_date = X[date_column].dropna()
+        x_date = date_converter.to_date_ms(X).dropna()
         if len(x_date) == 0:
             self.logger.warning("Empty date column in X")
             return []
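
Note: the stability check no longer mutates X via convert(); the renamed DateTimeConverter gains a to_date_ms helper that (presumably, judging by its name and usage here) returns the parsed date column as an epoch-milliseconds Series without touching the frame. A hedged sketch of the new call pattern:

    import pandas as pd

    from upgini.utils.datetime_utils import DateTimeConverter  # renamed from DateTimeSearchKeyConverter

    X = pd.DataFrame({"order_date": ["2024-01-01", "2024-01-02"]})
    converter = DateTimeConverter("order_date", generate_cyclical_features=False)
    # side-effect-free extraction, replacing convert(X) followed by X[date_column]
    x_date = converter.to_date_ms(X).dropna()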
@@ -1434,8 +1434,7 @@
             if date_column not in eval_x.columns:
                 self.logger.warning(f"Date column not found in eval_set {i + 1}")
                 continue
-            eval_x = date_converter.convert(eval_x)
-            eval_x_date = eval_x[date_column].dropna()
+            eval_x_date = date_converter.to_date_ms(eval_x).dropna()
             if len(eval_x_date) < 1000:
                 self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
                 continue
@@ -1472,8 +1471,7 @@
         )
         checking_eval_set_df = checking_eval_set_df.copy()

-        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-        checking_eval_set_df = date_converter.convert(checking_eval_set_df)
+        checking_eval_set_df[date_column] = date_converter.to_date_ms(eval_set_dates[selected_eval_set_idx].to_frame())

         psi_values_sparse = calculate_sparsity_psi(
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -1745,9 +1743,11 @@
             not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
-                + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
+                + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
             )
         ]
+        if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
+            client_features.append(self.baseline_score_column)
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

         selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
@@ -1995,7 +1995,7 @@
         date_column = self._get_date_column(search_keys)
         generated_features = []
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(
+            converter = DateTimeConverter(
                 date_column,
                 self.date_format,
                 self.logger,
@@ -2004,6 +2004,7 @@
             )
             # Leave original date column values
             df_with_date_features = converter.convert(df, keep_time=True)
+            # TODO check if this is correct
             df_with_date_features[date_column] = df[date_column]
             df = df_with_date_features
             generated_features = converter.generated_features
@@ -2035,8 +2036,8 @@
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)

-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

         df = df.rename(columns=columns_renaming)
         generated_features = [columns_renaming.get(c, c) for c in generated_features]
@@ -2388,7 +2389,7 @@
     def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
         search_task = search_task or self._search_task
         if search_task is not None:
-            trace_id = trace_id or uuid.uuid4()
+            trace_id = trace_id or time.time_ns()
             return search_task.get_progress(trace_id)

     def display_transactional_transform_api(self, only_online_sources=False):
@@ -2416,7 +2417,7 @@
                 return "12345678"
             return "test_value"

-        file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
+        file_metadata = self._search_task.get_file_metadata(time.time_ns())

         def get_column_meta(column_name: str) -> FileColumnMetadata:
             for c in file_metadata.columns:
@@ -2510,7 +2511,7 @@ if response.status_code == 200:

         start_time = time.time()
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")

             validated_X, validated_y, validated_eval_set = self._validate_train_eval(
@@ -2552,10 +2553,15 @@
             if transform_usage.has_limit:
                 if len(X) > transform_usage.rest_rows:
                     rest_rows = max(transform_usage.rest_rows, 0)
-                    msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
+                    bundle_msg = (
+                        "transform_usage_warning_registered"
+                        if self.__is_registered
+                        else "transform_usage_warning_demo"
+                    )
+                    msg = self.bundle.get(bundle_msg).format(len(X), rest_rows)
                     self.logger.warning(msg)
                     print(msg)
-                    show_request_quote_button()
+                    show_request_quote_button(is_registered=self.__is_registered)
                     return None, {}, [], {}
                 else:
                     msg = self.bundle.get("transform_usage_info").format(
@@ -2599,7 +2605,7 @@
             generated_features = []
             date_column = self._get_date_column(search_keys)
             if date_column is not None:
-                converter = DateTimeSearchKeyConverter(
+                converter = DateTimeConverter(
                     date_column,
                     self.date_format,
                     self.logger,
@@ -2656,8 +2662,8 @@

             # Don't pass all features in backend on transform
             runtime_parameters = self._get_copy_of_runtime_parameters()
-            features_for_transform = self._search_task.get_features_for_transform() or []
-            if len(features_for_transform) > 0:
+            features_for_transform = self._search_task.get_features_for_transform()
+            if features_for_transform:
                 missing_features_for_transform = [
                     columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
                 ]
@@ -2668,7 +2674,10 @@
                     raise ValidationError(
                         self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
                     )
-                runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
+            features_for_embeddings = self._search_task.get_features_for_embeddings()
+            if features_for_embeddings:
+                runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_embeddings)
+            features_for_transform = [f for f in features_for_transform if f not in search_keys.keys()]

             columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)

@@ -2729,8 +2738,22 @@
             )
             df = converter.convert(df)

+            date_features = []
+            for col in features_for_transform:
+                if DateTimeConverter(col).is_datetime(df):
+                    df[col] = DateTimeConverter(col).to_date_string(df)
+                    date_features.append(col)
+
             meaning_types = {}
-            meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+            meaning_types.update(
+                {
+                    col: FileColumnMeaningType.FEATURE
+                    for col in features_for_transform
+                    if col not in date_features and col not in generated_features
+                }
+            )
+            meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in generated_features})
+            meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
             meaning_types.update({col: key.value for col, key in search_keys.items()})

             features_not_to_pass.extend(
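
Note: the transform path above and the fit path further down now classify outgoing columns into three meaning types instead of one: datetime-like feature columns are serialized with to_date_string and tagged DATE_FEATURE, autofe-generated columns are tagged GENERATED_FEATURE, and everything else stays FEATURE. Because the dict updates run in that order, later tags win. A condensed sketch of the resulting precedence, using the enum added in metadata.py:

    from upgini.metadata import FileColumnMeaningType

    def classify(col, date_features, generated_features):
        # DATE_FEATURE wins over GENERATED_FEATURE, which wins over plain FEATURE,
        # mirroring the order of the meaning_types.update() calls in the diff
        if col in date_features:
            return FileColumnMeaningType.DATE_FEATURE
        if col in generated_features:
            return FileColumnMeaningType.GENERATED_FEATURE
        return FileColumnMeaningType.FEATURE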
@@ -2743,8 +2766,8 @@
                 ]
             )

-            if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-                df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+            if DateTimeConverter.DATETIME_COL in df.columns:
+                df = df.drop(columns=DateTimeConverter.DATETIME_COL)

             # search keys might be changed after explode
             columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
@@ -2926,6 +2949,7 @@
                     or c in self.search_keys
                     or c in (self.id_columns or [])
                     or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+                    or c == self.baseline_score_column
                 ]
             else:
                 selected_input_columns = []
@@ -3124,7 +3148,7 @@
         self.fit_generated_features = []

         if has_date:
-            converter = DateTimeSearchKeyConverter(
+            converter = DateTimeConverter(
                 maybe_date_column,
                 self.date_format,
                 self.logger,
@@ -3177,8 +3201,8 @@
             self.TARGET_NAME,
             EVAL_SET_INDEX,
         ] + list(self.fit_search_keys.keys())
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

         features_columns = [c for c in df.columns if c not in non_feature_columns]

@@ -3265,15 +3289,28 @@
             ENTITY_SYSTEM_RECORD_ID,
             SEARCH_KEY_UNNEST,
         ] + list(self.fit_search_keys.keys())
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

         features_columns = [c for c in df.columns if c not in non_feature_columns]

+        # find date features
+        date_features = []
+        for col in features_columns:
+            if DateTimeConverter(col).is_datetime(df):
+                df[col] = DateTimeConverter(col).to_date_string(df)
+                date_features.append(col)
+
         meaning_types = {
             **{col: key.value for col, key in self.fit_search_keys.items()},
-            **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
+            **{
+                str(c): FileColumnMeaningType.FEATURE
+                for c in df.columns
+                if c not in non_feature_columns and c not in date_features and c not in self.fit_generated_features
+            },
         }
+        meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in self.fit_generated_features})
+        meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
         meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
         meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
         if SEARCH_KEY_UNNEST in df.columns:
@@ -3294,8 +3331,8 @@
             self.bundle,
         )

-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID

@@ -3332,7 +3369,14 @@
         dataset.columns_renaming = self.fit_columns_renaming

         self.passed_features = [
-            column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
+            column
+            for column, meaning_type in meaning_types.items()
+            if meaning_type
+            in [
+                FileColumnMeaningType.FEATURE,
+                FileColumnMeaningType.DATE_FEATURE,
+                FileColumnMeaningType.GENERATED_FEATURE,
+            ]
         ]

         self._search_task = dataset.search(
@@ -3860,8 +3904,8 @@
         X = Xy.drop(columns=TARGET)
         y = Xy[TARGET].copy()

-        if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-            X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

         return X, y

@@ -3871,8 +3915,8 @@
         X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
     ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
-            if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-                date_column = DateTimeSearchKeyConverter.DATETIME_COL
+            if DateTimeConverter.DATETIME_COL in X.columns:
+                date_column = DateTimeConverter.DATETIME_COL
             else:
                 date_column = FeaturesEnricher._get_date_column(search_keys)
             sort_columns = [date_column] if date_column is not None else []
@@ -3900,8 +3944,8 @@

         y = Xy[TARGET].copy()

-        if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-            X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

         return X, y

@@ -3980,12 +4024,10 @@
             maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
             if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
                 # TODO cast date column to single dtype
-                date_converter = DateTimeSearchKeyConverter(
-                    maybe_date_col, self.date_format, generate_cyclical_features=False
-                )
-                converted_X = date_converter.convert(X)
-                min_date = converted_X[maybe_date_col].min()
-                max_date = converted_X[maybe_date_col].max()
+                date_converter = DateTimeConverter(maybe_date_col, self.date_format, generate_cyclical_features=False)
+                date_col_values = date_converter.to_date_ms(X)
+                min_date = date_col_values.min()
+                max_date = date_col_values.max()
                 self.logger.info(f"Dates interval is ({min_date}, {max_date})")

         except Exception:
@@ -4022,7 +4064,7 @@
             self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
-            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
+            converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
            df = converter.convert(df)
         return df

@@ -4153,8 +4195,8 @@
             "__target",
             ENTITY_SYSTEM_RECORD_ID,
         ]
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            date_column = DateTimeSearchKeyConverter.DATETIME_COL
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            date_column = DateTimeConverter.DATETIME_COL
             sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
         else:
             date_column = FeaturesEnricher._get_date_column(search_keys)
@@ -4399,7 +4441,9 @@
             raise Exception(self.bundle.get("missing_features_meta"))
         features_meta = deepcopy(features_meta)

-        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        file_metadata_columns = self._search_task.get_file_metadata(trace_id).columns
+        file_meta_by_orig_name = {c.originalName: c for c in file_metadata_columns}
+        original_names_dict = {c.name: c.originalName for c in file_metadata_columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

         # To be sure that names with hash suffixes
@@ -4419,7 +4463,11 @@
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
             feature_meta.name = original_name

-            is_client_feature = original_name in clients_features_df.columns
+            file_meta = file_meta_by_orig_name.get(original_name)
+            is_generated_feature = (
+                file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
+            )
+            is_client_feature = original_name in clients_features_df.columns and not is_generated_feature

             if selected_features is not None and feature_meta.name not in selected_features:
                 self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
@@ -4442,9 +4490,13 @@

         for feature_meta in selected_features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in clients_features_df.columns
+            file_meta = file_meta_by_orig_name.get(original_name)
+            is_generated_feature = (
+                file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
+            )
+            is_client_feature = original_name in clients_features_df.columns and not is_generated_feature

-            if not is_client_feature:
+            if not is_client_feature and not is_generated_feature:
                 self.external_source_feature_names.append(original_name)

             if self.psi_values is not None:
@@ -4475,9 +4527,10 @@

             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
-
             df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
-            feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
+            feature_info = FeatureInfo.from_metadata(
+                feature_meta, df_for_sample, is_client_feature, is_generated_feature
+            )
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
             internal_features_info.append(feature_info.to_internal_row(self.bundle))
@@ -4488,7 +4541,7 @@
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
             # If all psi values are 0 or null, drop psi column
-            if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
+            if self.features_info[self.bundle.get("features_info_psi")].astype(np.float64).fillna(0.0).eq(0.0).all():
                 self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
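
Note: the PSI column can hold object-dtype values (for example numbers serialized as strings), and on object dtype eq(0.0) compares Python objects, so an effectively all-zero column could fail the check and escape the drop. Casting to float first makes the all-zero test reliable; a small reproduction:

    import numpy as np
    import pandas as pd

    s = pd.Series(["0.0", None], dtype=object)
    print(s.fillna(0.0).eq(0.0).all())                     # False: "0.0" != 0.0 as objects
    print(s.astype(np.float64).fillna(0.0).eq(0.0).all())  # True after a uniform float cast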
@@ -4954,7 +5007,7 @@
         eval_set: tuple | None = None,
     ):
         def dump_task(X_, y_, eval_set_):
-            with MDC(trace_id=trace_id):
+            with MDC(correlation_id=trace_id):
                 try:
                     if isinstance(X_, pd.Series):
                         X_ = X_.to_frame()
--- upgini-1.2.124/src/upgini/metadata.py
+++ upgini-1.2.127/src/upgini/metadata.py
@@ -36,6 +36,8 @@ class FileColumnMeaningType(Enum):
     SCORE = "SCORE"
     TARGET = "TARGET"
     FEATURE = "FEATURE"
+    GENERATED_FEATURE = "GENERATED_FEATURE"
+    DATE_FEATURE = "DATE_FEATURE"
     CUSTOM_KEY = "CUSTOM_KEY"
     COUNTRY = "COUNTRY"
     POSTAL_CODE = "POSTAL_CODE"
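
Note: these two members back the new column classification in features_enricher.py and dataset.py; their string values travel through the file metadata, which is how the client detects generated features (file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE). A minimal check:

    from upgini.metadata import FileColumnMeaningType

    assert FileColumnMeaningType.GENERATED_FEATURE.value == "GENERATED_FEATURE"
    assert FileColumnMeaningType.DATE_FEATURE.value == "DATE_FEATURE"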
--- upgini-1.2.124/src/upgini/normalizer/normalize_utils.py
+++ upgini-1.2.127/src/upgini/normalizer/normalize_utils.py
@@ -25,7 +25,7 @@ from upgini.metadata import (
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils import find_numbers_with_decimal_comma
 from upgini.utils.country_utils import CountrySearchKeyConverter
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.datetime_utils import DateTimeConverter
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyConverter
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
@@ -89,7 +89,7 @@ class Normalizer:
                 SYSTEM_RECORD_ID,
                 ENTITY_SYSTEM_RECORD_ID,
                 SEARCH_KEY_UNNEST,
-                DateTimeSearchKeyConverter.DATETIME_COL,
+                DateTimeConverter.DATETIME_COL,
             ]:
                 self.columns_renaming[column] = column
                 new_columns.append(column)