upgini 1.2.122a4__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/http.py CHANGED
@@ -433,8 +433,8 @@ class _RestClient:
433
433
  with open(file_path, "rb") as file:
434
434
  content = file.read()
435
435
  md5_hash.update(content)
436
- digest = md5_hash.hexdigest()
437
- metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
436
+ digest_md5 = md5_hash.hexdigest()
437
+ metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
438
438
 
439
439
  digest_sha256 = file_hash(file_path)
440
440
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
upgini/metadata.py CHANGED
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
12
12
  EVAL_SET_INDEX = "eval_set_index"
13
13
  TARGET = "target"
14
14
  COUNTRY = "country_iso_code"
15
+ CURRENT_DATE_COL = "current_date_"
15
16
  RENAMED_INDEX = "index_col"
16
17
  DEFAULT_INDEX = "index"
17
18
  ORIGINAL_INDEX = "original_index"
18
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
19
+ SYSTEM_COLUMNS = {
20
+ SYSTEM_RECORD_ID,
21
+ ENTITY_SYSTEM_RECORD_ID,
22
+ SEARCH_KEY_UNNEST,
23
+ EVAL_SET_INDEX,
24
+ TARGET,
25
+ COUNTRY,
26
+ CURRENT_DATE_COL,
27
+ }
19
28
 
20
29
 
21
30
  class FileColumnMeaningType(Enum):
@@ -36,6 +45,8 @@ class FileColumnMeaningType(Enum):
36
45
  SCORE = "SCORE"
37
46
  TARGET = "TARGET"
38
47
  FEATURE = "FEATURE"
48
+ GENERATED_FEATURE = "GENERATED_FEATURE"
49
+ DATE_FEATURE = "DATE_FEATURE"
39
50
  CUSTOM_KEY = "CUSTOM_KEY"
40
51
  COUNTRY = "COUNTRY"
41
52
  POSTAL_CODE = "POSTAL_CODE"
@@ -85,7 +96,7 @@ class SearchKey(Enum):
85
96
  return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
86
97
 
87
98
  @staticmethod
88
- def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
99
+ def from_meaning_type(meaning_type: FileColumnMeaningType) -> Optional["SearchKey"]:
89
100
  if meaning_type == FileColumnMeaningType.EMAIL:
90
101
  return SearchKey.EMAIL
91
102
  if meaning_type == FileColumnMeaningType.HEM:
@@ -163,7 +174,9 @@ class ModelTaskType(Enum):
163
174
  return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
164
175
 
165
176
  @staticmethod
166
- def parse(task_type: Any) -> "ModelTaskType":
177
+ def parse(task_type: Any) -> Optional["ModelTaskType"]:
178
+ if task_type is None:
179
+ return None
167
180
  if isinstance(task_type, ModelTaskType):
168
181
  return task_type
169
182
  elif isinstance(task_type, str):
@@ -248,6 +261,9 @@ class FileMetadata(BaseModel):
248
261
  rowsCount: Optional[int] = None
249
262
  checksumMD5: Optional[str] = None
250
263
  digest: Optional[str] = None
264
+ deterministicDigest: Optional[str] = None
265
+ droppedColumns: Optional[List[str]] = None
266
+ autodetectedSearchKeys: Optional[Dict[str, str]] = None
251
267
 
252
268
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
253
269
  for c in self.columns:
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.api.types import is_bool_dtype as is_bool
8
- from pandas.api.types import is_datetime64_any_dtype as is_datetime
9
8
  from pandas.api.types import (
10
9
  is_float_dtype,
11
10
  is_numeric_dtype,
@@ -25,7 +24,7 @@ from upgini.metadata import (
25
24
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
26
25
  from upgini.utils import find_numbers_with_decimal_comma
27
26
  from upgini.utils.country_utils import CountrySearchKeyConverter
28
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
27
+ from upgini.utils.datetime_utils import DateTimeConverter
29
28
  from upgini.utils.ip_utils import IpSearchKeyConverter
30
29
  from upgini.utils.phone_utils import PhoneSearchKeyConverter
31
30
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
@@ -45,7 +44,7 @@ class Normalizer:
45
44
  self.columns_renaming = {}
46
45
  self.search_keys = {}
47
46
  self.generated_features = []
48
- self.removed_features = []
47
+ self.removed_datetime_features = []
49
48
 
50
49
  def normalize(
51
50
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -89,7 +88,7 @@ class Normalizer:
89
88
  SYSTEM_RECORD_ID,
90
89
  ENTITY_SYSTEM_RECORD_ID,
91
90
  SEARCH_KEY_UNNEST,
92
- DateTimeSearchKeyConverter.DATETIME_COL,
91
+ DateTimeConverter.DATETIME_COL,
93
92
  ]:
94
93
  self.columns_renaming[column] = column
95
94
  new_columns.append(column)
@@ -134,8 +133,9 @@ class Normalizer:
134
133
  features = self._get_features(df)
135
134
 
136
135
  for f in features:
137
- if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
138
- self.removed_features.append(f)
136
+ converter = DateTimeConverter(f)
137
+ if converter.is_datetime(df) and f != DateTimeConverter.DATETIME_COL:
138
+ self.removed_datetime_features.append(f)
139
139
  df.drop(columns=f, inplace=True)
140
140
 
141
141
  return df
@@ -12,7 +12,8 @@ polling_unregister_information=We'll send email notification once it's completed
12
12
  ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
13
13
  demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
14
14
  transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
15
- transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
15
+ transform_usage_warning_demo=Unregistered-user limit: {} rows remaining; you requested {}.
16
+ transform_usage_warning_registered=Free tier limit: {} rows remaining; you requested {}.
16
17
 
17
18
  # Warnings
18
19
  support_link=https://upgini.com/support
@@ -139,6 +140,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
139
140
  eval_x_has_train_samples=Eval set X has rows that are present in train set X
140
141
  oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
141
142
  oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
143
+ autodetected_search_key_not_found=Autodetected on fit search key {} not found in X columns: {} for transform
142
144
 
143
145
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
144
146
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and retry
@@ -174,7 +176,8 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
174
176
  dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
175
177
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
176
178
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
177
- dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
179
+ dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class
180
+ #\nPlease, remove rows with rarest class from your dataframe
178
181
  dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
179
182
  dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
180
183
  dataset_too_many_features=Too many features. Maximum number of features is {}
@@ -209,15 +212,16 @@ features_info_zero_important_features=Oops, we can't find any relevant external
209
212
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
210
213
  features_not_generated=Following features didn't pass checks for automated feature generation: {}
211
214
  # Information
212
- postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
213
- country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
214
- country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
215
- country_default_determined=Search key country_code `{}` was used as default. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
216
- email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
217
- email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
218
- phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
219
- phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
220
- target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
215
+ datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
216
+ postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
217
+ country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
218
+ country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
219
+ country_default_determined=Search key country_code `{}` was used as default. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
220
+ email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
221
+ email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
222
+ phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
223
+ phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
224
+ target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
221
225
  binary_target_reason=only two unique label-values observed
222
226
  non_numeric_multiclass_reason=non-numeric label values observed
223
227
  few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
upgini/search_task.py CHANGED
@@ -165,10 +165,21 @@ class SearchTask:
165
165
 
166
166
  return list(zero_hit_search_keys)
167
167
 
168
- def get_features_for_transform(self) -> Optional[List[str]]:
168
def get_features_for_embeddings(self) -> Optional[List[str]]:
    """Collect the features used for embeddings across all provider metadata.

    Returns:
        ``None`` when ``self.provider_metadata_v2`` is ``None``; otherwise a
        list of the distinct entries found in every non-``None``
        ``features_used_for_embeddings`` attribute, in first-seen order.
    """
    if self.provider_metadata_v2 is None:
        return None

    # Dict keys de-duplicate while keeping insertion order. A set of strings
    # may iterate in a different order on each interpreter run, which would
    # make the returned list non-reproducible.
    ordered_features = {}
    for meta in self.provider_metadata_v2:
        if meta.features_used_for_embeddings is not None:
            ordered_features.update(dict.fromkeys(meta.features_used_for_embeddings))

    return list(ordered_features)
178
+
179
+ def get_features_for_transform(self) -> List[str]:
180
+ if self.provider_metadata_v2 is None:
181
+ return []
182
+
172
183
  features_for_transform = set()
173
184
  for meta in self.provider_metadata_v2:
174
185
  if meta.features_used_for_embeddings is not None:
@@ -423,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
423
434
  tmp_file_name = f"{tmp_dir}/{file_name}"
424
435
  with open(tmp_file_name, "wb") as gzip_file:
425
436
  gzip_file.write(file_content)
426
- return pd.read_parquet(tmp_file_name, engine="fastparquet")
437
+ # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
438
+ return pd.read_parquet(tmp_file_name, engine="pyarrow")
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
24
24
  for column_name in other_columns:
25
25
  if self._is_search_key_by_values(df[column_name]):
26
26
  columns_by_values.append(column_name)
27
- return list(set(columns_by_names + columns_by_values))
27
+
28
+ both = [col for col in columns_by_names if col in columns_by_values]
29
+ only_values = [col for col in columns_by_values if col not in columns_by_names]
30
+ only_names = [col for col in columns_by_names if col not in columns_by_values]
31
+ return both + only_values + only_names
@@ -1,6 +1,5 @@
1
1
  import datetime
2
2
  import logging
3
- import re
4
3
  from typing import Dict, List, Optional
5
4
 
6
5
  import numpy as np
@@ -11,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
11
10
  from upgini.errors import ValidationError
12
11
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
13
12
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
13
+ from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
14
14
 
15
15
  DATE_FORMATS = [
16
16
  "%Y-%m-%d",
@@ -30,7 +30,16 @@ DATE_FORMATS = [
30
30
  DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
31
31
 
32
32
 
33
- class DateTimeSearchKeyConverter:
33
class DateSearchKeyDetector(BaseSearchKeyDetector):
    """Search-key detector for date/datetime columns, by name or by values."""

    def _is_search_key_by_name(self, column_name: str) -> bool:
        """Return True when the lower-cased name contains a date-like marker."""
        normalized = str(column_name).lower()
        return any(marker in normalized for marker in ("date", "time", "timestamp"))

    def _is_search_key_by_values(self, column: pd.Series) -> bool:
        """Return True when ``DateTimeConverter`` recognises the column as datetime."""
        name = column.name
        converter = DateTimeConverter(name)
        # The converter works on a DataFrame, so wrap the series in a
        # single-column frame keyed by the same name.
        return converter.is_datetime(column.to_frame(name))
40
+
41
+
42
+ class DateTimeConverter:
34
43
  DATETIME_COL = "_date_time"
35
44
  # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
36
45
  MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
@@ -67,47 +76,108 @@ class DateTimeSearchKeyConverter:
67
76
  try:
68
77
  if s is None or len(str(s).strip()) == 0:
69
78
  return None
70
- if not re.match(DATETIME_PATTERN, str(s)):
79
+ if sum(ch.isdigit() for ch in str(s)) < 6:
71
80
  return None
72
81
  return s
73
82
  except Exception:
74
83
  return None
75
84
 
76
- def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
77
- if len(df) == 0:
78
- return df
85
def is_datetime(self, df: pd.DataFrame) -> bool:
    """Tell whether column ``self.date_column`` of ``df`` holds datetime data.

    Args:
        df: frame that contains the column named ``self.date_column``.

    Returns:
        ``False`` for an empty frame or an all-null column; ``True`` when the
        column already has a datetime64 dtype; otherwise ``True`` only when
        ``self.parse_datetime(df, raise_errors=False)`` yields a series in
        which at most half of the values are missing.
    """
    if len(df) == 0 or df[self.date_column].isna().all():
        return False

    if pd.api.types.is_datetime64_any_dtype(df[self.date_column]):
        return True

    parsed = self.parse_datetime(df, raise_errors=False)
    if parsed is None:
        return False
    # The comparison yields numpy.bool_; coerce it so the declared ``bool``
    # return type actually holds.
    return bool(parsed.isna().mean() <= 0.5)
94
+
95
+ def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
96
+ if len(df) == 0 or df[self.date_column].isna().all():
97
+ return None
98
+
99
+ date_col = df[self.date_column].copy()
100
+
101
+ try:
102
+ if date_col.apply(lambda x: isinstance(x, datetime.datetime)).all():
103
+ parsed_datetime = date_col.apply(lambda x: x.replace(tzinfo=None))
104
+ elif isinstance(date_col.dropna().values[0], datetime.date):
105
+ parsed_datetime = pd.to_datetime(date_col, errors="coerce")
106
+ elif isinstance(date_col.dtype, pd.PeriodDtype):
107
+ parsed_datetime = date_col.dt.to_timestamp()
108
+ elif is_numeric_dtype(date_col):
109
+ # 315532801 - 2524608001 - seconds
110
+ # 315532801000 - 2524608001000 - milliseconds
111
+ # 315532801000000 - 2524608001000000 - microseconds
112
+ # 315532801000000000 - 2524608001000000000 - nanoseconds
113
+ if date_col.apply(lambda x: 10**16 < x).all():
114
+ parsed_datetime = pd.to_datetime(date_col, unit="ns")
115
+ elif date_col.apply(lambda x: 10**14 < x < 10**16).all():
116
+ parsed_datetime = pd.to_datetime(date_col, unit="us")
117
+ elif date_col.apply(lambda x: 10**11 < x < 10**14).all():
118
+ parsed_datetime = pd.to_datetime(date_col, unit="ms")
119
+ elif date_col.apply(lambda x: 10**8 < x < 10**11).all():
120
+ parsed_datetime = pd.to_datetime(date_col, unit="s")
121
+ else:
122
+ msg = self.bundle.get("unsupported_date_type").format(self.date_column)
123
+ if raise_errors:
124
+ raise ValidationError(msg)
125
+ else:
126
+ return None
127
+ else:
128
+ date_col = date_col.astype("string").apply(self.clean_date)
129
+ parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
130
+ if parsed_datetime.isna().all():
131
+ raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
132
+ parsed_datetime = parsed_datetime.dt.tz_localize(None)
133
+ return parsed_datetime
134
+ except Exception as e:
135
+ if raise_errors:
136
+ raise ValidationError(e)
100
137
  else:
101
- msg = self.bundle.get("unsupported_date_type").format(self.date_column)
102
- raise ValidationError(msg)
138
+ return None
139
+
140
def to_date_string(self, df: pd.DataFrame) -> pd.Series:
    """Render the date column as ``YYYY-MM-DD`` strings.

    When ``self.parse_datetime(df)`` returns ``None`` the original column is
    returned unchanged.
    """
    timestamps = self.parse_datetime(df)
    if timestamps is not None:
        return timestamps.dt.strftime("%Y-%m-%d")
    return df[self.date_column]
145
+
146
def to_date_ms(self, df: pd.DataFrame) -> pd.Series:
    """Convert the date column via ``convert_datetime_to_date_ms``.

    When ``self.parse_datetime(df)`` returns ``None`` the original column is
    returned unchanged.
    """
    timestamps = self.parse_datetime(df)
    if timestamps is not None:
        return self.convert_datetime_to_date_ms(timestamps)
    return df[self.date_column]
151
+
152
def convert_datetime_to_datetime_ms(self, date_col: pd.Series) -> pd.Series:
    """Convert a datetime series to integer milliseconds (nullable ``Int64``).

    Args:
        date_col: series with a datetime dtype (it must expose ``.dt``).

    Returns:
        Series of millisecond values after each element has been passed
        through ``self._int_to_opt`` and cast to ``Int64``.
        NOTE(review): ``_int_to_opt`` is defined outside this view; it
        presumably maps a sentinel value to ``None`` - confirm.

    Raises:
        ValueError: if the series resolution is not one of ns/us/ms/s.
    """
    # ``.dt.unit`` only exists on pandas >= 2.0. Older versions store
    # datetime64 data at nanosecond resolution, so fall back to "ns" there.
    unit = getattr(date_col.dt, "unit", "ns")

    if unit == "ns":
        millis = date_col.astype(np.int64) // 1_000_000
    elif unit == "us":
        millis = date_col.astype(np.int64) // 1_000
    elif unit == "ms":
        millis = date_col.astype(np.int64)
    elif unit == "s":
        millis = date_col.astype(np.int64) * 1_000
    else:
        raise ValueError(f"Unsupported date unit: {unit}")

    return millis.apply(self._int_to_opt).astype("Int64")
165
+
166
def convert_datetime_to_date_ms(self, date_col: pd.Series) -> pd.Series:
    """Floor each timestamp to its day, then convert with ``convert_datetime_to_datetime_ms``."""
    return self.convert_datetime_to_datetime_ms(date_col.dt.floor("D"))
169
+
170
+ def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
171
+ df = df.copy()
172
+ parsed_datetime = self.parse_datetime(df)
173
+ if parsed_datetime is None:
174
+ return df
175
+
176
+ df[self.date_column] = parsed_datetime
106
177
 
107
178
  # If column with date is datetime then extract seconds of the day and minute of the hour
108
179
  # as additional features
109
180
  seconds = "datetime_seconds"
110
- df[self.date_column] = df[self.date_column].dt.tz_localize(None)
111
181
 
112
182
  df = self.clean_old_dates(df)
113
183
 
@@ -182,21 +252,22 @@ class DateTimeSearchKeyConverter:
182
252
  df.drop(columns=seconds, inplace=True)
183
253
 
184
254
  if keep_time:
185
- df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
186
- df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
187
- df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
188
- df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
255
+ df[self.DATETIME_COL] = self.convert_datetime_to_datetime_ms(df[self.date_column])
256
+ df[self.date_column] = self.convert_datetime_to_date_ms(df[self.date_column])
189
257
 
190
258
  self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
191
259
 
192
260
  return df
193
261
 
194
- def parse_date(self, df: pd.DataFrame):
262
+ def parse_string_date(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
195
263
  if self.date_format is not None:
196
264
  try:
197
265
  return pd.to_datetime(df[self.date_column], format=self.date_format)
198
266
  except ValueError as e:
199
- raise ValidationError(e)
267
+ if raise_errors:
268
+ raise ValidationError(e)
269
+ else:
270
+ return None
200
271
  else:
201
272
  for date_format in DATE_FORMATS:
202
273
  try:
@@ -204,9 +275,20 @@ class DateTimeSearchKeyConverter:
204
275
  except ValueError:
205
276
  pass
206
277
  try:
207
- return pd.to_datetime(df[self.date_column])
278
+ # Suppress warning for intentional fallback to dateutil parsing
279
+ import warnings
280
+
281
+ with warnings.catch_warnings():
282
+ warnings.filterwarnings("ignore", message="Could not infer format")
283
+ return pd.to_datetime(df[self.date_column])
208
284
  except ValueError:
209
- raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
285
+ try:
286
+ return pd.to_datetime(df[self.date_column], format="mixed", errors="raise")
287
+ except ValueError:
288
+ if raise_errors:
289
+ raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
290
+ else:
291
+ return None
210
292
 
211
293
  def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
212
294
  condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
@@ -339,6 +421,10 @@ def is_dates_distribution_valid(
339
421
  if maybe_date_col is None:
340
422
  return
341
423
 
424
+ # Don't check if date column is constant
425
+ if X[maybe_date_col].nunique() <= 1:
426
+ return
427
+
342
428
  if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
343
429
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
344
430
  elif pd.__version__ >= "2.0.0":
@@ -14,7 +14,7 @@ from upgini.metadata import (
14
14
  SearchKey,
15
15
  )
16
16
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
17
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
+ from upgini.utils.datetime_utils import DateTimeConverter
18
18
  from upgini.utils.target_utils import define_task
19
19
 
20
20
 
@@ -31,7 +31,7 @@ def remove_fintech_duplicates(
31
31
  logger = logging.getLogger()
32
32
  logger.setLevel(logging.FATAL)
33
33
  date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
34
- if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
34
+ if define_task(df[TARGET], date_col is not None, logger=logger, silent=True) != ModelTaskType.BINARY:
35
35
  return df, []
36
36
 
37
37
  if date_col is None:
@@ -104,7 +104,7 @@ def remove_fintech_duplicates(
104
104
  sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
105
105
 
106
106
  # Convert date columns for further checks
107
- sub_df = DateTimeSearchKeyConverter(
107
+ sub_df = DateTimeConverter(
108
108
  date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
109
109
  ).convert(sub_df)
110
110
  grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
@@ -160,7 +160,10 @@ def remove_fintech_duplicates(
160
160
 
161
161
 
162
162
  def clean_full_duplicates(
163
- df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
163
+ df: pd.DataFrame,
164
+ is_transform: bool = False,
165
+ logger: Optional[Logger] = None,
166
+ bundle: Optional[ResourceBundle] = None,
164
167
  ) -> Tuple[pd.DataFrame, Optional[str]]:
165
168
  if logger is None:
166
169
  logger = logging.getLogger()
@@ -193,7 +196,7 @@ def clean_full_duplicates(
193
196
  logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
194
197
 
195
198
  msg = None
196
- if TARGET in df.columns:
199
+ if not is_transform and TARGET in df.columns:
197
200
  unique_columns.remove(TARGET)
198
201
 
199
202
  # Separate rows to exclude from deduplication:
@@ -8,7 +8,6 @@ from io import StringIO
8
8
  from typing import Callable, List, Optional
9
9
 
10
10
  import pandas as pd
11
- from xhtml2pdf import pisa
12
11
 
13
12
  from upgini.__about__ import __version__
14
13
 
@@ -325,31 +324,73 @@ def show_button_download_pdf(
325
324
 
326
325
  # html = HTML(string=source)
327
326
  # html.write_pdf(file_name)
328
- with open(file_name, "wb") as output:
329
- pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
330
-
331
- with open(file_name, "rb") as f:
332
- b64 = base64.b64encode(f.read())
333
- payload = b64.decode()
334
- html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
335
- <button>{title}</button></a>"""
336
- if display_handle is not None:
337
- display_handle.update(HTML(html))
338
- else:
339
- return display(HTML(html), display_id=display_id)
327
+ try:
328
+ from xhtml2pdf import pisa
329
+
330
+ with open(file_name, "wb") as output:
331
+ pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
332
+
333
+ with open(file_name, "rb") as f:
334
+ b64 = base64.b64encode(f.read())
335
+ payload = b64.decode()
336
+ html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
337
+ <button>{title}</button></a>"""
338
+ if display_handle is not None:
339
+ display_handle.update(HTML(html))
340
+ else:
341
+ return display(HTML(html), display_id=display_id)
342
+ except Exception:
343
+ pass
340
344
 
341
345
 
342
- def show_request_quote_button():
346
+ def show_request_quote_button(is_registered: bool):
343
347
  if not ipython_available():
344
- print("https://upgini.com/request-a-quote")
348
+ if is_registered:
349
+ print("https://upgini.com/request-a-quote")
350
+ else:
351
+ print("https://profile.upgini.com/login")
345
352
  else:
346
- import ipywidgets as widgets
347
- from IPython.display import Javascript, display
348
-
349
- button = widgets.Button(description="Request a quote", button_style="danger")
353
+ from IPython.display import HTML, display, Javascript
354
+ from ipywidgets import Layout, Button
355
+
356
+ if is_registered:
357
+ display(HTML("""
358
+ <style>
359
+ button.custom-button {
360
+ border: 1px solid black !important;
361
+ background: white !important;
362
+ color: black !important;
363
+ white-space: nowrap;
364
+ }
365
+ </style>
366
+ """))
367
+ description = "Request a quote"
368
+ tooltip = "Ask a quote"
369
+ url = "https://upgini.com/request-a-quote"
370
+ else:
371
+ display(HTML("""
372
+ <style>
373
+ button.custom-button {
374
+ border: 1px solid #d00 !important;
375
+ background: #fff !important;
376
+ color: #d00 !important;
377
+ white-space: nowrap;
378
+ }
379
+ </style>
380
+ """))
381
+ description = "Get an API KEY"
382
+ tooltip = "Register"
383
+ url = "https://profile.upgini.com/login"
384
+
385
+ button = Button(
386
+ description=description,
387
+ layout=Layout(width='auto'),
388
+ tooltip=tooltip
389
+ )
390
+ button.add_class("custom-button")
350
391
 
351
392
  def on_button_clicked(b):
352
- display(Javascript('window.open("https://upgini.com/request-a-quote");'))
393
+ display(Javascript('window.open("' + url + '");'))
353
394
 
354
395
  button.on_click(on_button_clicked)
355
396