upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -3
- upgini/data_source/data_source_publisher.py +1 -9
- upgini/dataset.py +56 -6
- upgini/features_enricher.py +634 -556
- upgini/http.py +2 -2
- upgini/metadata.py +16 -2
- upgini/normalizer/normalize_utils.py +6 -6
- upgini/resource_bundle/strings.properties +15 -11
- upgini/search_task.py +14 -2
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +125 -39
- upgini/utils/deduplicate_utils.py +8 -5
- upgini/utils/display_utils.py +61 -20
- upgini/utils/feature_info.py +18 -7
- upgini/utils/features_validator.py +6 -4
- upgini/utils/postal_code_utils.py +35 -2
- upgini/utils/target_utils.py +3 -1
- upgini/utils/track_info.py +29 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/METADATA +123 -121
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/RECORD +23 -23
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/WHEEL +1 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/licenses/LICENSE +0 -0
upgini/http.py
CHANGED
|
@@ -433,8 +433,8 @@ class _RestClient:
|
|
|
433
433
|
with open(file_path, "rb") as file:
|
|
434
434
|
content = file.read()
|
|
435
435
|
md5_hash.update(content)
|
|
436
|
-
|
|
437
|
-
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5":
|
|
436
|
+
digest_md5 = md5_hash.hexdigest()
|
|
437
|
+
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
|
|
438
438
|
|
|
439
439
|
digest_sha256 = file_hash(file_path)
|
|
440
440
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
upgini/metadata.py
CHANGED
|
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
|
|
|
12
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
13
13
|
TARGET = "target"
|
|
14
14
|
COUNTRY = "country_iso_code"
|
|
15
|
+
CURRENT_DATE_COL = "current_date_"
|
|
15
16
|
RENAMED_INDEX = "index_col"
|
|
16
17
|
DEFAULT_INDEX = "index"
|
|
17
18
|
ORIGINAL_INDEX = "original_index"
|
|
18
|
-
SYSTEM_COLUMNS = {
|
|
19
|
+
SYSTEM_COLUMNS = {
|
|
20
|
+
SYSTEM_RECORD_ID,
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
22
|
+
SEARCH_KEY_UNNEST,
|
|
23
|
+
EVAL_SET_INDEX,
|
|
24
|
+
TARGET,
|
|
25
|
+
COUNTRY,
|
|
26
|
+
CURRENT_DATE_COL,
|
|
27
|
+
}
|
|
19
28
|
|
|
20
29
|
|
|
21
30
|
class FileColumnMeaningType(Enum):
|
|
@@ -36,6 +45,8 @@ class FileColumnMeaningType(Enum):
|
|
|
36
45
|
SCORE = "SCORE"
|
|
37
46
|
TARGET = "TARGET"
|
|
38
47
|
FEATURE = "FEATURE"
|
|
48
|
+
GENERATED_FEATURE = "GENERATED_FEATURE"
|
|
49
|
+
DATE_FEATURE = "DATE_FEATURE"
|
|
39
50
|
CUSTOM_KEY = "CUSTOM_KEY"
|
|
40
51
|
COUNTRY = "COUNTRY"
|
|
41
52
|
POSTAL_CODE = "POSTAL_CODE"
|
|
@@ -85,7 +96,7 @@ class SearchKey(Enum):
|
|
|
85
96
|
return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
|
|
86
97
|
|
|
87
98
|
@staticmethod
|
|
88
|
-
def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
|
|
99
|
+
def from_meaning_type(meaning_type: FileColumnMeaningType) -> Optional["SearchKey"]:
|
|
89
100
|
if meaning_type == FileColumnMeaningType.EMAIL:
|
|
90
101
|
return SearchKey.EMAIL
|
|
91
102
|
if meaning_type == FileColumnMeaningType.HEM:
|
|
@@ -250,6 +261,9 @@ class FileMetadata(BaseModel):
|
|
|
250
261
|
rowsCount: Optional[int] = None
|
|
251
262
|
checksumMD5: Optional[str] = None
|
|
252
263
|
digest: Optional[str] = None
|
|
264
|
+
deterministicDigest: Optional[str] = None
|
|
265
|
+
droppedColumns: Optional[List[str]] = None
|
|
266
|
+
autodetectedSearchKeys: Optional[Dict[str, str]] = None
|
|
253
267
|
|
|
254
268
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
255
269
|
for c in self.columns:
|
|
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.api.types import is_bool_dtype as is_bool
|
|
8
|
-
from pandas.api.types import is_datetime64_any_dtype as is_datetime
|
|
9
8
|
from pandas.api.types import (
|
|
10
9
|
is_float_dtype,
|
|
11
10
|
is_numeric_dtype,
|
|
@@ -25,7 +24,7 @@ from upgini.metadata import (
|
|
|
25
24
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
26
25
|
from upgini.utils import find_numbers_with_decimal_comma
|
|
27
26
|
from upgini.utils.country_utils import CountrySearchKeyConverter
|
|
28
|
-
from upgini.utils.datetime_utils import
|
|
27
|
+
from upgini.utils.datetime_utils import DateTimeConverter
|
|
29
28
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
30
29
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
|
31
30
|
from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
|
|
@@ -45,7 +44,7 @@ class Normalizer:
|
|
|
45
44
|
self.columns_renaming = {}
|
|
46
45
|
self.search_keys = {}
|
|
47
46
|
self.generated_features = []
|
|
48
|
-
self.
|
|
47
|
+
self.removed_datetime_features = []
|
|
49
48
|
|
|
50
49
|
def normalize(
|
|
51
50
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -89,7 +88,7 @@ class Normalizer:
|
|
|
89
88
|
SYSTEM_RECORD_ID,
|
|
90
89
|
ENTITY_SYSTEM_RECORD_ID,
|
|
91
90
|
SEARCH_KEY_UNNEST,
|
|
92
|
-
|
|
91
|
+
DateTimeConverter.DATETIME_COL,
|
|
93
92
|
]:
|
|
94
93
|
self.columns_renaming[column] = column
|
|
95
94
|
new_columns.append(column)
|
|
@@ -134,8 +133,9 @@ class Normalizer:
|
|
|
134
133
|
features = self._get_features(df)
|
|
135
134
|
|
|
136
135
|
for f in features:
|
|
137
|
-
|
|
138
|
-
|
|
136
|
+
converter = DateTimeConverter(f)
|
|
137
|
+
if converter.is_datetime(df) and f != DateTimeConverter.DATETIME_COL:
|
|
138
|
+
self.removed_datetime_features.append(f)
|
|
139
139
|
df.drop(columns=f, inplace=True)
|
|
140
140
|
|
|
141
141
|
return df
|
|
@@ -12,7 +12,8 @@ polling_unregister_information=We'll send email notification once it's completed
|
|
|
12
12
|
ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
|
|
13
13
|
demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
|
|
14
14
|
transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
|
|
15
|
-
|
|
15
|
+
transform_usage_warning_demo=Unregistered-user limit: {} rows remaining; you requested {}.
|
|
16
|
+
transform_usage_warning_registered=Free tier limit: {} rows remaining; you requested {}.
|
|
16
17
|
|
|
17
18
|
# Warnings
|
|
18
19
|
support_link=https://upgini.com/support
|
|
@@ -139,6 +140,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
|
139
140
|
eval_x_has_train_samples=Eval set X has rows that are present in train set X
|
|
140
141
|
oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
|
|
141
142
|
oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
|
|
143
|
+
autodetected_search_key_not_found=Autodetected on fit search key {} not found in X columns: {} for transform
|
|
142
144
|
|
|
143
145
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
144
146
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
@@ -174,7 +176,8 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
174
176
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
175
177
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
176
178
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
177
|
-
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class
|
|
179
|
+
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class
|
|
180
|
+
#\nPlease, remove rows with rarest class from your dataframe
|
|
178
181
|
dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
179
182
|
dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
180
183
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
@@ -209,15 +212,16 @@ features_info_zero_important_features=Oops, we can't find any relevant external
|
|
|
209
212
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
210
213
|
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
211
214
|
# Information
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
215
|
+
datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
216
|
+
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
217
|
+
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
218
|
+
country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
219
|
+
country_default_determined=Search key country_code `{}` was used as default. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
220
|
+
email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
221
|
+
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
222
|
+
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
223
|
+
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
|
|
224
|
+
target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
221
225
|
binary_target_reason=only two unique label-values observed
|
|
222
226
|
non_numeric_multiclass_reason=non-numeric label values observed
|
|
223
227
|
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
upgini/search_task.py
CHANGED
|
@@ -165,10 +165,21 @@ class SearchTask:
|
|
|
165
165
|
|
|
166
166
|
return list(zero_hit_search_keys)
|
|
167
167
|
|
|
168
|
-
def
|
|
168
|
+
def get_features_for_embeddings(self) -> Optional[List[str]]:
|
|
169
169
|
if self.provider_metadata_v2 is None:
|
|
170
170
|
return None
|
|
171
171
|
|
|
172
|
+
features_for_transform = set()
|
|
173
|
+
for meta in self.provider_metadata_v2:
|
|
174
|
+
if meta.features_used_for_embeddings is not None:
|
|
175
|
+
features_for_transform.update(meta.features_used_for_embeddings)
|
|
176
|
+
|
|
177
|
+
return list(features_for_transform)
|
|
178
|
+
|
|
179
|
+
def get_features_for_transform(self) -> List[str]:
|
|
180
|
+
if self.provider_metadata_v2 is None:
|
|
181
|
+
return []
|
|
182
|
+
|
|
172
183
|
features_for_transform = set()
|
|
173
184
|
for meta in self.provider_metadata_v2:
|
|
174
185
|
if meta.features_used_for_embeddings is not None:
|
|
@@ -423,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
|
|
|
423
434
|
tmp_file_name = f"{tmp_dir}/{file_name}"
|
|
424
435
|
with open(tmp_file_name, "wb") as gzip_file:
|
|
425
436
|
gzip_file.write(file_content)
|
|
426
|
-
|
|
437
|
+
# Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
|
|
438
|
+
return pd.read_parquet(tmp_file_name, engine="pyarrow")
|
|
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
|
|
|
24
24
|
for column_name in other_columns:
|
|
25
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
26
26
|
columns_by_values.append(column_name)
|
|
27
|
-
|
|
27
|
+
|
|
28
|
+
both = [col for col in columns_by_names if col in columns_by_values]
|
|
29
|
+
only_values = [col for col in columns_by_values if col not in columns_by_names]
|
|
30
|
+
only_names = [col for col in columns_by_names if col not in columns_by_values]
|
|
31
|
+
return both + only_values + only_names
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
|
-
import re
|
|
4
3
|
from typing import Dict, List, Optional
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
@@ -11,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
|
|
|
11
10
|
from upgini.errors import ValidationError
|
|
12
11
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
13
12
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
13
|
+
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
14
14
|
|
|
15
15
|
DATE_FORMATS = [
|
|
16
16
|
"%Y-%m-%d",
|
|
@@ -30,7 +30,16 @@ DATE_FORMATS = [
|
|
|
30
30
|
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
class
|
|
33
|
+
class DateSearchKeyDetector(BaseSearchKeyDetector):
|
|
34
|
+
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
35
|
+
lower_column_name = str(column_name).lower()
|
|
36
|
+
return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
|
|
37
|
+
|
|
38
|
+
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
39
|
+
return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DateTimeConverter:
|
|
34
43
|
DATETIME_COL = "_date_time"
|
|
35
44
|
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
36
45
|
MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
|
|
@@ -67,47 +76,108 @@ class DateTimeSearchKeyConverter:
|
|
|
67
76
|
try:
|
|
68
77
|
if s is None or len(str(s).strip()) == 0:
|
|
69
78
|
return None
|
|
70
|
-
if
|
|
79
|
+
if sum(ch.isdigit() for ch in str(s)) < 6:
|
|
71
80
|
return None
|
|
72
81
|
return s
|
|
73
82
|
except Exception:
|
|
74
83
|
return None
|
|
75
84
|
|
|
76
|
-
def
|
|
77
|
-
if len(df) == 0:
|
|
78
|
-
return
|
|
85
|
+
def is_datetime(self, df: pd.DataFrame) -> bool:
|
|
86
|
+
if len(df) == 0 or df[self.date_column].isna().all():
|
|
87
|
+
return False
|
|
79
88
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
89
|
+
if pd.api.types.is_datetime64_any_dtype(df[self.date_column]):
|
|
90
|
+
return True
|
|
91
|
+
|
|
92
|
+
parsed = self.parse_datetime(df, raise_errors=False)
|
|
93
|
+
return parsed is not None and parsed.isna().mean() <= 0.5
|
|
94
|
+
|
|
95
|
+
def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
|
|
96
|
+
if len(df) == 0 or df[self.date_column].isna().all():
|
|
97
|
+
return None
|
|
98
|
+
|
|
99
|
+
date_col = df[self.date_column].copy()
|
|
100
|
+
|
|
101
|
+
try:
|
|
102
|
+
if date_col.apply(lambda x: isinstance(x, datetime.datetime)).all():
|
|
103
|
+
parsed_datetime = date_col.apply(lambda x: x.replace(tzinfo=None))
|
|
104
|
+
elif isinstance(date_col.dropna().values[0], datetime.date):
|
|
105
|
+
parsed_datetime = pd.to_datetime(date_col, errors="coerce")
|
|
106
|
+
elif isinstance(date_col.dtype, pd.PeriodDtype):
|
|
107
|
+
parsed_datetime = date_col.dt.to_timestamp()
|
|
108
|
+
elif is_numeric_dtype(date_col):
|
|
109
|
+
# 315532801 - 2524608001 - seconds
|
|
110
|
+
# 315532801000 - 2524608001000 - milliseconds
|
|
111
|
+
# 315532801000000 - 2524608001000000 - microseconds
|
|
112
|
+
# 315532801000000000 - 2524608001000000000 - nanoseconds
|
|
113
|
+
if date_col.apply(lambda x: 10**16 < x).all():
|
|
114
|
+
parsed_datetime = pd.to_datetime(date_col, unit="ns")
|
|
115
|
+
elif date_col.apply(lambda x: 10**14 < x < 10**16).all():
|
|
116
|
+
parsed_datetime = pd.to_datetime(date_col, unit="us")
|
|
117
|
+
elif date_col.apply(lambda x: 10**11 < x < 10**14).all():
|
|
118
|
+
parsed_datetime = pd.to_datetime(date_col, unit="ms")
|
|
119
|
+
elif date_col.apply(lambda x: 10**8 < x < 10**11).all():
|
|
120
|
+
parsed_datetime = pd.to_datetime(date_col, unit="s")
|
|
121
|
+
else:
|
|
122
|
+
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
123
|
+
if raise_errors:
|
|
124
|
+
raise ValidationError(msg)
|
|
125
|
+
else:
|
|
126
|
+
return None
|
|
127
|
+
else:
|
|
128
|
+
date_col = date_col.astype("string").apply(self.clean_date)
|
|
129
|
+
parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
|
|
130
|
+
if parsed_datetime.isna().all():
|
|
131
|
+
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
132
|
+
parsed_datetime = parsed_datetime.dt.tz_localize(None)
|
|
133
|
+
return parsed_datetime
|
|
134
|
+
except Exception as e:
|
|
135
|
+
if raise_errors:
|
|
136
|
+
raise ValidationError(e)
|
|
100
137
|
else:
|
|
101
|
-
|
|
102
|
-
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
def to_date_string(self, df: pd.DataFrame) -> pd.Series:
|
|
141
|
+
parsed_datetime = self.parse_datetime(df)
|
|
142
|
+
if parsed_datetime is None:
|
|
143
|
+
return df[self.date_column]
|
|
144
|
+
return parsed_datetime.dt.strftime("%Y-%m-%d")
|
|
145
|
+
|
|
146
|
+
def to_date_ms(self, df: pd.DataFrame) -> pd.Series:
|
|
147
|
+
parsed_datetime = self.parse_datetime(df)
|
|
148
|
+
if parsed_datetime is None:
|
|
149
|
+
return df[self.date_column]
|
|
150
|
+
return self.convert_datetime_to_date_ms(parsed_datetime)
|
|
151
|
+
|
|
152
|
+
def convert_datetime_to_datetime_ms(self, date_col: pd.Series) -> pd.Series:
|
|
153
|
+
if date_col.dt.unit == "ns":
|
|
154
|
+
date_col = date_col.astype(np.int64) // 1_000_000
|
|
155
|
+
elif date_col.dt.unit == "us":
|
|
156
|
+
date_col = date_col.astype(np.int64) // 1_000
|
|
157
|
+
elif date_col.dt.unit == "ms":
|
|
158
|
+
date_col = date_col.astype(np.int64)
|
|
159
|
+
elif date_col.dt.unit == "s":
|
|
160
|
+
date_col = date_col.astype(np.int64) * 1_000
|
|
103
161
|
else:
|
|
104
|
-
|
|
105
|
-
|
|
162
|
+
raise ValueError(f"Unsupported date unit: {date_col.dt.unit}")
|
|
163
|
+
|
|
164
|
+
return date_col.apply(self._int_to_opt).astype("Int64")
|
|
165
|
+
|
|
166
|
+
def convert_datetime_to_date_ms(self, date_col: pd.Series) -> pd.Series:
|
|
167
|
+
date_col = date_col.dt.floor("D")
|
|
168
|
+
return self.convert_datetime_to_datetime_ms(date_col)
|
|
169
|
+
|
|
170
|
+
def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
|
|
171
|
+
df = df.copy()
|
|
172
|
+
parsed_datetime = self.parse_datetime(df)
|
|
173
|
+
if parsed_datetime is None:
|
|
174
|
+
return df
|
|
175
|
+
|
|
176
|
+
df[self.date_column] = parsed_datetime
|
|
106
177
|
|
|
107
178
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
108
179
|
# as additional features
|
|
109
180
|
seconds = "datetime_seconds"
|
|
110
|
-
df[self.date_column] = df[self.date_column].dt.tz_localize(None)
|
|
111
181
|
|
|
112
182
|
df = self.clean_old_dates(df)
|
|
113
183
|
|
|
@@ -182,21 +252,22 @@ class DateTimeSearchKeyConverter:
|
|
|
182
252
|
df.drop(columns=seconds, inplace=True)
|
|
183
253
|
|
|
184
254
|
if keep_time:
|
|
185
|
-
df[self.DATETIME_COL] = df[self.date_column]
|
|
186
|
-
|
|
187
|
-
df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
|
|
188
|
-
df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
|
|
255
|
+
df[self.DATETIME_COL] = self.convert_datetime_to_datetime_ms(df[self.date_column])
|
|
256
|
+
df[self.date_column] = self.convert_datetime_to_date_ms(df[self.date_column])
|
|
189
257
|
|
|
190
258
|
self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
|
|
191
259
|
|
|
192
260
|
return df
|
|
193
261
|
|
|
194
|
-
def
|
|
262
|
+
def parse_string_date(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
|
|
195
263
|
if self.date_format is not None:
|
|
196
264
|
try:
|
|
197
265
|
return pd.to_datetime(df[self.date_column], format=self.date_format)
|
|
198
266
|
except ValueError as e:
|
|
199
|
-
|
|
267
|
+
if raise_errors:
|
|
268
|
+
raise ValidationError(e)
|
|
269
|
+
else:
|
|
270
|
+
return None
|
|
200
271
|
else:
|
|
201
272
|
for date_format in DATE_FORMATS:
|
|
202
273
|
try:
|
|
@@ -204,9 +275,20 @@ class DateTimeSearchKeyConverter:
|
|
|
204
275
|
except ValueError:
|
|
205
276
|
pass
|
|
206
277
|
try:
|
|
207
|
-
|
|
278
|
+
# Suppress warning for intentional fallback to dateutil parsing
|
|
279
|
+
import warnings
|
|
280
|
+
|
|
281
|
+
with warnings.catch_warnings():
|
|
282
|
+
warnings.filterwarnings("ignore", message="Could not infer format")
|
|
283
|
+
return pd.to_datetime(df[self.date_column])
|
|
208
284
|
except ValueError:
|
|
209
|
-
|
|
285
|
+
try:
|
|
286
|
+
return pd.to_datetime(df[self.date_column], format="mixed", errors="raise")
|
|
287
|
+
except ValueError:
|
|
288
|
+
if raise_errors:
|
|
289
|
+
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
290
|
+
else:
|
|
291
|
+
return None
|
|
210
292
|
|
|
211
293
|
def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
212
294
|
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
@@ -339,6 +421,10 @@ def is_dates_distribution_valid(
|
|
|
339
421
|
if maybe_date_col is None:
|
|
340
422
|
return
|
|
341
423
|
|
|
424
|
+
# Don't check if date column is constant
|
|
425
|
+
if X[maybe_date_col].nunique() <= 1:
|
|
426
|
+
return
|
|
427
|
+
|
|
342
428
|
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
343
429
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
344
430
|
elif pd.__version__ >= "2.0.0":
|
|
@@ -14,7 +14,7 @@ from upgini.metadata import (
|
|
|
14
14
|
SearchKey,
|
|
15
15
|
)
|
|
16
16
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
17
|
-
from upgini.utils.datetime_utils import
|
|
17
|
+
from upgini.utils.datetime_utils import DateTimeConverter
|
|
18
18
|
from upgini.utils.target_utils import define_task
|
|
19
19
|
|
|
20
20
|
|
|
@@ -31,7 +31,7 @@ def remove_fintech_duplicates(
|
|
|
31
31
|
logger = logging.getLogger()
|
|
32
32
|
logger.setLevel(logging.FATAL)
|
|
33
33
|
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
34
|
-
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
34
|
+
if define_task(df[TARGET], date_col is not None, logger=logger, silent=True) != ModelTaskType.BINARY:
|
|
35
35
|
return df, []
|
|
36
36
|
|
|
37
37
|
if date_col is None:
|
|
@@ -104,7 +104,7 @@ def remove_fintech_duplicates(
|
|
|
104
104
|
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
105
105
|
|
|
106
106
|
# Convert date columns for further checks
|
|
107
|
-
sub_df =
|
|
107
|
+
sub_df = DateTimeConverter(
|
|
108
108
|
date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
|
|
109
109
|
).convert(sub_df)
|
|
110
110
|
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
@@ -160,7 +160,10 @@ def remove_fintech_duplicates(
|
|
|
160
160
|
|
|
161
161
|
|
|
162
162
|
def clean_full_duplicates(
|
|
163
|
-
df: pd.DataFrame,
|
|
163
|
+
df: pd.DataFrame,
|
|
164
|
+
is_transform: bool = False,
|
|
165
|
+
logger: Optional[Logger] = None,
|
|
166
|
+
bundle: Optional[ResourceBundle] = None,
|
|
164
167
|
) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
165
168
|
if logger is None:
|
|
166
169
|
logger = logging.getLogger()
|
|
@@ -193,7 +196,7 @@ def clean_full_duplicates(
|
|
|
193
196
|
logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
|
|
194
197
|
|
|
195
198
|
msg = None
|
|
196
|
-
if TARGET in df.columns:
|
|
199
|
+
if not is_transform and TARGET in df.columns:
|
|
197
200
|
unique_columns.remove(TARGET)
|
|
198
201
|
|
|
199
202
|
# Separate rows to exclude from deduplication:
|
upgini/utils/display_utils.py
CHANGED
|
@@ -8,7 +8,6 @@ from io import StringIO
|
|
|
8
8
|
from typing import Callable, List, Optional
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
|
-
from xhtml2pdf import pisa
|
|
12
11
|
|
|
13
12
|
from upgini.__about__ import __version__
|
|
14
13
|
|
|
@@ -325,31 +324,73 @@ def show_button_download_pdf(
|
|
|
325
324
|
|
|
326
325
|
# html = HTML(string=source)
|
|
327
326
|
# html.write_pdf(file_name)
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
327
|
+
try:
|
|
328
|
+
from xhtml2pdf import pisa
|
|
329
|
+
|
|
330
|
+
with open(file_name, "wb") as output:
|
|
331
|
+
pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
|
|
332
|
+
|
|
333
|
+
with open(file_name, "rb") as f:
|
|
334
|
+
b64 = base64.b64encode(f.read())
|
|
335
|
+
payload = b64.decode()
|
|
336
|
+
html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
|
|
337
|
+
<button>{title}</button></a>"""
|
|
338
|
+
if display_handle is not None:
|
|
339
|
+
display_handle.update(HTML(html))
|
|
340
|
+
else:
|
|
341
|
+
return display(HTML(html), display_id=display_id)
|
|
342
|
+
except Exception:
|
|
343
|
+
pass
|
|
340
344
|
|
|
341
345
|
|
|
342
|
-
def show_request_quote_button():
|
|
346
|
+
def show_request_quote_button(is_registered: bool):
|
|
343
347
|
if not ipython_available():
|
|
344
|
-
|
|
348
|
+
if is_registered:
|
|
349
|
+
print("https://upgini.com/request-a-quote")
|
|
350
|
+
else:
|
|
351
|
+
print("https://profile.upgini.com/login")
|
|
345
352
|
else:
|
|
346
|
-
import
|
|
347
|
-
from
|
|
348
|
-
|
|
349
|
-
|
|
353
|
+
from IPython.display import HTML, display, Javascript
|
|
354
|
+
from ipywidgets import Layout, Button
|
|
355
|
+
|
|
356
|
+
if is_registered:
|
|
357
|
+
display(HTML("""
|
|
358
|
+
<style>
|
|
359
|
+
button.custom-button {
|
|
360
|
+
border: 1px solid black !important;
|
|
361
|
+
background: white !important;
|
|
362
|
+
color: black !important;
|
|
363
|
+
white-space: nowrap;
|
|
364
|
+
}
|
|
365
|
+
</style>
|
|
366
|
+
"""))
|
|
367
|
+
description = "Request a quote"
|
|
368
|
+
tooltip = "Ask a quote"
|
|
369
|
+
url = "https://upgini.com/request-a-quote"
|
|
370
|
+
else:
|
|
371
|
+
display(HTML("""
|
|
372
|
+
<style>
|
|
373
|
+
button.custom-button {
|
|
374
|
+
border: 1px solid #d00 !important;
|
|
375
|
+
background: #fff !important;
|
|
376
|
+
color: #d00 !important;
|
|
377
|
+
white-space: nowrap;
|
|
378
|
+
}
|
|
379
|
+
</style>
|
|
380
|
+
"""))
|
|
381
|
+
description = "Get an API KEY"
|
|
382
|
+
tooltip = "Register"
|
|
383
|
+
url = "https://profile.upgini.com/login"
|
|
384
|
+
|
|
385
|
+
button = Button(
|
|
386
|
+
description=description,
|
|
387
|
+
layout=Layout(width='auto'),
|
|
388
|
+
tooltip=tooltip
|
|
389
|
+
)
|
|
390
|
+
button.add_class("custom-button")
|
|
350
391
|
|
|
351
392
|
def on_button_clicked(b):
|
|
352
|
-
display(Javascript('window.open("
|
|
393
|
+
display(Javascript('window.open("' + url + '");'))
|
|
353
394
|
|
|
354
395
|
button.on_click(on_button_clicked)
|
|
355
396
|
|
upgini/utils/feature_info.py
CHANGED
|
@@ -31,7 +31,10 @@ class FeatureInfo:
|
|
|
31
31
|
|
|
32
32
|
@staticmethod
|
|
33
33
|
def from_metadata(
|
|
34
|
-
feature_meta: FeaturesMetadataV2,
|
|
34
|
+
feature_meta: FeaturesMetadataV2,
|
|
35
|
+
data: Optional[pd.DataFrame],
|
|
36
|
+
is_client_feature: bool,
|
|
37
|
+
is_generated_feature: bool,
|
|
35
38
|
) -> "FeatureInfo":
|
|
36
39
|
return FeatureInfo(
|
|
37
40
|
name=_get_name(feature_meta),
|
|
@@ -41,8 +44,8 @@ class FeatureInfo:
|
|
|
41
44
|
value_preview=_get_feature_sample(feature_meta, data),
|
|
42
45
|
provider=_get_provider(feature_meta, is_client_feature),
|
|
43
46
|
internal_provider=_get_internal_provider(feature_meta, is_client_feature),
|
|
44
|
-
source=_get_source(feature_meta, is_client_feature),
|
|
45
|
-
internal_source=_get_internal_source(feature_meta, is_client_feature),
|
|
47
|
+
source=_get_source(feature_meta, is_client_feature, is_generated_feature),
|
|
48
|
+
internal_source=_get_internal_source(feature_meta, is_client_feature, is_generated_feature),
|
|
46
49
|
update_frequency=feature_meta.update_frequency,
|
|
47
50
|
commercial_schema=feature_meta.commercial_schema,
|
|
48
51
|
doc_link=feature_meta.doc_link,
|
|
@@ -139,22 +142,30 @@ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature:
|
|
|
139
142
|
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
140
143
|
|
|
141
144
|
|
|
142
|
-
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
145
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
|
|
146
|
+
if is_generated_feature:
|
|
147
|
+
return "AutoFE: features from Training dataset"
|
|
148
|
+
|
|
143
149
|
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
144
150
|
source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
145
151
|
if sources:
|
|
146
152
|
source = _make_links(sources, source_links)
|
|
147
153
|
else:
|
|
148
|
-
source = _get_internal_source(feature_meta, is_client_feature)
|
|
154
|
+
source = _get_internal_source(feature_meta, is_client_feature, is_generated_feature)
|
|
149
155
|
return source
|
|
150
156
|
|
|
151
157
|
|
|
152
|
-
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
158
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
|
|
159
|
+
if is_generated_feature:
|
|
160
|
+
return "AutoFE: features from Training dataset"
|
|
161
|
+
|
|
153
162
|
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
154
163
|
if sources:
|
|
155
164
|
return ", ".join(sources)
|
|
165
|
+
elif feature_meta.data_source:
|
|
166
|
+
return feature_meta.data_source
|
|
156
167
|
else:
|
|
157
|
-
return
|
|
168
|
+
return (
|
|
158
169
|
LLM_SOURCE
|
|
159
170
|
if not feature_meta.name.endswith("_country")
|
|
160
171
|
and not feature_meta.name.endswith("_postal_code")
|