upgini 1.2.136__tar.gz → 1.2.138__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {upgini-1.2.136 → upgini-1.2.138}/PKG-INFO +2 -1
  2. {upgini-1.2.136 → upgini-1.2.138}/pyproject.toml +1 -0
  3. upgini-1.2.138/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/dataset.py +33 -1
  5. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/features_enricher.py +33 -11
  6. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/http.py +2 -2
  7. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/metadata.py +11 -1
  8. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/normalizer/normalize_utils.py +4 -4
  9. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/resource_bundle/strings.properties +1 -0
  10. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/base_search_key_detector.py +5 -1
  11. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/datetime_utils.py +15 -1
  12. upgini-1.2.136/src/upgini/__about__.py +0 -1
  13. {upgini-1.2.136 → upgini-1.2.138}/.gitignore +0 -0
  14. {upgini-1.2.136 → upgini-1.2.138}/LICENSE +0 -0
  15. {upgini-1.2.136 → upgini-1.2.138}/README.md +0 -0
  16. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/__init__.py +0 -0
  17. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/ads.py +0 -0
  18. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/ads_management/__init__.py +0 -0
  19. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/ads_management/ads_manager.py +0 -0
  20. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/__init__.py +0 -0
  21. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/all_operators.py +0 -0
  22. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/binary.py +0 -0
  23. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/date.py +0 -0
  24. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/feature.py +0 -0
  25. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/groupby.py +0 -0
  26. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/operator.py +0 -0
  27. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/__init__.py +0 -0
  28. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/base.py +0 -0
  29. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/cross.py +0 -0
  30. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/delta.py +0 -0
  31. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/lag.py +0 -0
  32. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/roll.py +0 -0
  33. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/trend.py +0 -0
  34. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/timeseries/volatility.py +0 -0
  35. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/unary.py +0 -0
  36. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/utils.py +0 -0
  37. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/autofe/vector.py +0 -0
  38. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/data_source/__init__.py +0 -0
  39. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/data_source/data_source_publisher.py +0 -0
  40. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/errors.py +0 -0
  41. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/mdc/__init__.py +0 -0
  42. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/mdc/context.py +0 -0
  43. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/metrics.py +0 -0
  44. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/normalizer/__init__.py +0 -0
  45. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/resource_bundle/__init__.py +0 -0
  46. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/resource_bundle/exceptions.py +0 -0
  47. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/deduplicate_utils.py +0 -0
  62. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.136 → upgini-1.2.138}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.136
3
+ Version: 1.2.138
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
34
34
  Requires-Dist: numpy<3.0.0,>=1.19.0
35
35
  Requires-Dist: pandas<3.0.0,>=1.1.0
36
36
  Requires-Dist: psutil>=5.9.0
37
+ Requires-Dist: pyarrow==18.1.0
37
38
  Requires-Dist: pydantic<3.0.0,>1.0.0
38
39
  Requires-Dist: pyjwt>=2.8.0
39
40
  Requires-Dist: python-bidi==0.4.2
@@ -56,6 +56,7 @@ dependencies = [
56
56
  "psutil>=5.9.0",
57
57
  "category-encoders>=2.8.1",
58
58
  "more_itertools==10.7.0",
59
+ "pyarrow==18.1.0",
59
60
  ]
60
61
 
61
62
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.138"
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
7
7
 
8
8
  import numpy as np
9
9
  import pandas as pd
10
+ import pyarrow as pa
11
+ import pyarrow.parquet as pq
10
12
  from pandas.api.types import (
11
13
  is_float_dtype,
12
14
  is_integer_dtype,
@@ -18,6 +20,7 @@ from pandas.api.types import (
18
20
  from upgini.errors import ValidationError
19
21
  from upgini.http import ProgressStage, SearchProgress, _RestClient
20
22
  from upgini.metadata import (
23
+ CURRENT_DATE_COL,
21
24
  ENTITY_SYSTEM_RECORD_ID,
22
25
  EVAL_SET_INDEX,
23
26
  SYSTEM_RECORD_ID,
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
38
41
  from upgini.search_task import SearchTask
39
42
  from upgini.utils.config import SampleConfig
40
43
  from upgini.utils.email_utils import EmailSearchKeyConverter
44
+ from upgini.utils.hash_utils import file_hash
41
45
  from upgini.utils.sample_utils import SampleColumns, sample
42
46
 
43
47
  try:
@@ -287,7 +291,7 @@ class Dataset:
287
291
  for key in search_group
288
292
  if key in self.columns_renaming
289
293
  and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
290
- and not self.columns_renaming.get(key) == "current_date"
294
+ and not self.columns_renaming.get(key) == CURRENT_DATE_COL
291
295
  }
292
296
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
293
297
  if (
@@ -469,6 +473,33 @@ class Dataset:
469
473
 
470
474
  columns.append(column_meta)
471
475
 
476
+ current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
477
+ with tempfile.TemporaryDirectory() as tmp_dir:
478
+ if (
479
+ self.date_column is not None
480
+ and self.data[self.date_column].nunique() == 1
481
+ and self.data[self.date_column].iloc[0] == current_date
482
+ ):
483
+ df_without_fake_date = self.data.drop(columns=[self.date_column])
484
+ else:
485
+ df_without_fake_date = self.data
486
+ parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
487
+
488
+ # calculate deterministic digest for any environment
489
+
490
+ table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
491
+ table = table.replace_schema_metadata({}) # remove all metadata
492
+ pq.write_table(
493
+ table,
494
+ parquet_file_path,
495
+ compression=None, # any compression will make it non-deterministic
496
+ data_page_size=0, # optional, to remove page layout variations
497
+ use_deprecated_int96_timestamps=False, # fix timestamp format
498
+ write_statistics=False, # remove statistics to make it deterministic
499
+ )
500
+
501
+ deterministic_digest = file_hash(parquet_file_path)
502
+
472
503
  return FileMetadata(
473
504
  name=self.dataset_name,
474
505
  description=self.description,
@@ -479,6 +510,7 @@ class Dataset:
479
510
  hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
480
511
  taskType=self.task_type,
481
512
  droppedColumns=self.dropped_columns,
513
+ deterministicDigest=deterministic_digest,
482
514
  )
483
515
 
484
516
  @staticmethod
@@ -44,6 +44,7 @@ from upgini.http import (
44
44
  from upgini.mdc import MDC
45
45
  from upgini.metadata import (
46
46
  COUNTRY,
47
+ CURRENT_DATE_COL,
47
48
  DEFAULT_INDEX,
48
49
  ENTITY_SYSTEM_RECORD_ID,
49
50
  EVAL_SET_INDEX,
@@ -76,6 +77,7 @@ from upgini.utils.custom_loss_utils import (
76
77
  )
77
78
  from upgini.utils.cv_utils import CVConfig, get_groups
78
79
  from upgini.utils.datetime_utils import (
80
+ DateSearchKeyDetector,
79
81
  DateTimeConverter,
80
82
  is_blocked_time_series,
81
83
  is_dates_distribution_valid,
@@ -167,7 +169,6 @@ class FeaturesEnricher(TransformerMixin):
167
169
  """
168
170
 
169
171
  TARGET_NAME = "target"
170
- CURRENT_DATE = "current_date"
171
172
  RANDOM_STATE = 42
172
173
  CALCULATE_METRICS_THRESHOLD = 50_000_000
173
174
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -238,6 +239,7 @@ class FeaturesEnricher(TransformerMixin):
238
239
  generate_search_key_features: bool = True,
239
240
  sample_config: SampleConfig | None = None,
240
241
  print_trace_id: bool = False,
242
+ print_loaded_report: bool = True,
241
243
  **kwargs,
242
244
  ):
243
245
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -284,7 +286,7 @@ class FeaturesEnricher(TransformerMixin):
284
286
  self.id_columns = id_columns
285
287
  self.id_columns_encoder = None
286
288
  self.country_code = country_code
287
- self.__validate_search_keys(search_keys, search_id)
289
+ self.__validate_search_keys(self.search_keys, search_id)
288
290
 
289
291
  self.model_task_type = ModelTaskType.parse(model_task_type)
290
292
  self.endpoint = endpoint
@@ -317,7 +319,8 @@ class FeaturesEnricher(TransformerMixin):
317
319
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
318
320
  df = pd.DataFrame(columns=x_columns)
319
321
  self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
320
- self.__show_selected_features()
322
+ if print_loaded_report:
323
+ self.__show_selected_features()
321
324
  # TODO validate search_keys with search_keys from file_metadata
322
325
  print(self.bundle.get("search_by_task_id_finish"))
323
326
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -2983,6 +2986,9 @@ if response.status_code == 200:
2983
2986
  else:
2984
2987
  selected_input_columns = []
2985
2988
 
2989
+ if DEFAULT_INDEX in selected_input_columns:
2990
+ selected_input_columns.remove(DEFAULT_INDEX)
2991
+
2986
2992
  return selected_input_columns + selected_generated_features
2987
2993
 
2988
2994
  def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
@@ -3167,7 +3173,7 @@ if response.status_code == 200:
3167
3173
 
3168
3174
  if DEFAULT_INDEX in df.columns:
3169
3175
  msg = self.bundle.get("unsupported_index_column")
3170
- self.logger.info(msg)
3176
+ self.logger.warning(msg)
3171
3177
  print(msg)
3172
3178
  self.fit_dropped_features.add(DEFAULT_INDEX)
3173
3179
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -3223,8 +3229,11 @@ if response.status_code == 200:
3223
3229
  df, self.fit_search_keys, self.fit_generated_features
3224
3230
  )
3225
3231
  self.fit_columns_renaming = normalizer.columns_renaming
3226
- if normalizer.removed_features:
3227
- self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
3232
+ if normalizer.removed_datetime_features:
3233
+ original_removed_datetime_features = [
3234
+ self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
3235
+ ]
3236
+ self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))
3228
3237
 
3229
3238
  non_feature_columns = [
3230
3239
  self.TARGET_NAME,
@@ -4090,12 +4099,14 @@ if response.status_code == 200:
4090
4099
  or set(search_keys.values()) == {SearchKey.EMAIL}
4091
4100
  or set(search_keys.values()) == {SearchKey.HEM}
4092
4101
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
4102
+ or len(search_keys) == 0
4103
+ or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
4093
4104
  ):
4094
4105
  if not silent:
4095
4106
  self.__log_warning(bundle.get("current_date_added"))
4096
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
4097
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
4098
- converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
4107
+ df[CURRENT_DATE_COL] = datetime.date.today()
4108
+ search_keys[CURRENT_DATE_COL] = SearchKey.DATE
4109
+ converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
4099
4110
  df = converter.convert(df)
4100
4111
  return df
4101
4112
 
@@ -4109,7 +4120,7 @@ if response.status_code == 200:
4109
4120
  return [
4110
4121
  col
4111
4122
  for col, t in search_keys.items()
4112
- if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
4123
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
4113
4124
  ]
4114
4125
 
4115
4126
  @staticmethod
@@ -4777,7 +4788,8 @@ if response.status_code == 200:
4777
4788
  else:
4778
4789
  msg = self.bundle.get("unregistered_only_personal_keys")
4779
4790
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
4780
- raise ValidationError(msg)
4791
+ # Current date will be added later
4792
+ # raise ValidationError(msg)
4781
4793
 
4782
4794
  if (
4783
4795
  len(valid_search_keys.values()) == 1
@@ -4896,6 +4908,16 @@ if response.status_code == 200:
4896
4908
  search_key in self.fit_search_keys.values() and search_key not in search_keys.values()
4897
4909
  )
4898
4910
 
4911
+ if check_need_detect(SearchKey.DATE) and check_need_detect(SearchKey.DATETIME):
4912
+ maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4913
+ if len(maybe_keys) > 0:
4914
+ datetime_key = maybe_keys[0]
4915
+ search_keys[datetime_key] = SearchKey.DATETIME
4916
+ self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
4917
+ self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4918
+ if not silent_mode:
4919
+ print(self.bundle.get("datetime_detected").format(datetime_key))
4920
+
4899
4921
  # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
4900
4922
  if check_need_detect(SearchKey.POSTAL_CODE):
4901
4923
  maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
@@ -433,8 +433,8 @@ class _RestClient:
433
433
  with open(file_path, "rb") as file:
434
434
  content = file.read()
435
435
  md5_hash.update(content)
436
- digest = md5_hash.hexdigest()
437
- metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
436
+ digest_md5 = md5_hash.hexdigest()
437
+ metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
438
438
 
439
439
  digest_sha256 = file_hash(file_path)
440
440
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
12
12
  EVAL_SET_INDEX = "eval_set_index"
13
13
  TARGET = "target"
14
14
  COUNTRY = "country_iso_code"
15
+ CURRENT_DATE_COL = "current_date_"
15
16
  RENAMED_INDEX = "index_col"
16
17
  DEFAULT_INDEX = "index"
17
18
  ORIGINAL_INDEX = "original_index"
18
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
19
+ SYSTEM_COLUMNS = {
20
+ SYSTEM_RECORD_ID,
21
+ ENTITY_SYSTEM_RECORD_ID,
22
+ SEARCH_KEY_UNNEST,
23
+ EVAL_SET_INDEX,
24
+ TARGET,
25
+ COUNTRY,
26
+ CURRENT_DATE_COL,
27
+ }
19
28
 
20
29
 
21
30
  class FileColumnMeaningType(Enum):
@@ -252,6 +261,7 @@ class FileMetadata(BaseModel):
252
261
  rowsCount: Optional[int] = None
253
262
  checksumMD5: Optional[str] = None
254
263
  digest: Optional[str] = None
264
+ deterministicDigest: Optional[str] = None
255
265
  droppedColumns: Optional[List[str]] = None
256
266
 
257
267
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.api.types import is_bool_dtype as is_bool
8
- from pandas.api.types import is_datetime64_any_dtype as is_datetime
9
8
  from pandas.api.types import (
10
9
  is_float_dtype,
11
10
  is_numeric_dtype,
@@ -45,7 +44,7 @@ class Normalizer:
45
44
  self.columns_renaming = {}
46
45
  self.search_keys = {}
47
46
  self.generated_features = []
48
- self.removed_features = []
47
+ self.removed_datetime_features = []
49
48
 
50
49
  def normalize(
51
50
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -134,8 +133,9 @@ class Normalizer:
134
133
  features = self._get_features(df)
135
134
 
136
135
  for f in features:
137
- if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
138
- self.removed_features.append(f)
136
+ converter = DateTimeConverter(f)
137
+ if converter.is_datetime(df):
138
+ self.removed_datetime_features.append(f)
139
139
  df.drop(columns=f, inplace=True)
140
140
 
141
141
  return df
@@ -210,6 +210,7 @@ features_info_zero_important_features=Oops, we can't find any relevant external
210
210
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
211
211
  features_not_generated=Following features didn't pass checks for automated feature generation: {}
212
212
  # Information
213
+ datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
213
214
  postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
214
215
  country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
215
216
  country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
24
24
  for column_name in other_columns:
25
25
  if self._is_search_key_by_values(df[column_name]):
26
26
  columns_by_values.append(column_name)
27
- return list(set(columns_by_names + columns_by_values))
27
+
28
+ both = [col for col in columns_by_names if col in columns_by_values]
29
+ only_values = [col for col in columns_by_values if col not in columns_by_names]
30
+ only_names = [col for col in columns_by_names if col not in columns_by_values]
31
+ return both + only_values + only_names
@@ -10,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
10
10
  from upgini.errors import ValidationError
11
11
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
12
12
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
13
+ from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
13
14
 
14
15
  DATE_FORMATS = [
15
16
  "%Y-%m-%d",
@@ -29,6 +30,15 @@ DATE_FORMATS = [
29
30
  DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
30
31
 
31
32
 
33
+ class DateSearchKeyDetector(BaseSearchKeyDetector):
34
+ def _is_search_key_by_name(self, column_name: str) -> bool:
35
+ lower_column_name = str(column_name).lower()
36
+ return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
37
+
38
+ def _is_search_key_by_values(self, column: pd.Series) -> bool:
39
+ return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
40
+
41
+
32
42
  class DateTimeConverter:
33
43
  DATETIME_COL = "_date_time"
34
44
  # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
@@ -80,7 +90,7 @@ class DateTimeConverter:
80
90
  return True
81
91
 
82
92
  parsed = self.parse_datetime(df, raise_errors=False)
83
- return parsed is not None and not parsed.isna().all()
93
+ return parsed is not None and parsed.isna().mean() <= 0.5
84
94
 
85
95
  def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
86
96
  if len(df) == 0 or df[self.date_column].isna().all():
@@ -408,6 +418,10 @@ def is_dates_distribution_valid(
408
418
  if maybe_date_col is None:
409
419
  return
410
420
 
421
+ # Don't check if date column is constant
422
+ if X[maybe_date_col].nunique() <= 1:
423
+ return
424
+
411
425
  if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
412
426
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
413
427
  elif pd.__version__ >= "2.0.0":
@@ -1 +0,0 @@
1
- __version__ = "1.2.136"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes