upgini 1.2.137__py3-none-any.whl → 1.2.138__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +24 -6
- upgini/normalizer/normalize_utils.py +4 -4
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +11 -1
- {upgini-1.2.137.dist-info → upgini-1.2.138.dist-info}/METADATA +1 -1
- {upgini-1.2.137.dist-info → upgini-1.2.138.dist-info}/RECORD +10 -10
- {upgini-1.2.137.dist-info → upgini-1.2.138.dist-info}/WHEEL +0 -0
- {upgini-1.2.137.dist-info → upgini-1.2.138.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.137"
|
|
1
|
+
__version__ = "1.2.138"
|
upgini/features_enricher.py
CHANGED
|
@@ -77,6 +77,7 @@ from upgini.utils.custom_loss_utils import (
|
|
|
77
77
|
)
|
|
78
78
|
from upgini.utils.cv_utils import CVConfig, get_groups
|
|
79
79
|
from upgini.utils.datetime_utils import (
|
|
80
|
+
DateSearchKeyDetector,
|
|
80
81
|
DateTimeConverter,
|
|
81
82
|
is_blocked_time_series,
|
|
82
83
|
is_dates_distribution_valid,
|
|
@@ -238,6 +239,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
238
239
|
generate_search_key_features: bool = True,
|
|
239
240
|
sample_config: SampleConfig | None = None,
|
|
240
241
|
print_trace_id: bool = False,
|
|
242
|
+
print_loaded_report: bool = True,
|
|
241
243
|
**kwargs,
|
|
242
244
|
):
|
|
243
245
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -284,7 +286,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
284
286
|
self.id_columns = id_columns
|
|
285
287
|
self.id_columns_encoder = None
|
|
286
288
|
self.country_code = country_code
|
|
287
|
-
self.__validate_search_keys(search_keys, search_id)
|
|
289
|
+
self.__validate_search_keys(self.search_keys, search_id)
|
|
288
290
|
|
|
289
291
|
self.model_task_type = ModelTaskType.parse(model_task_type)
|
|
290
292
|
self.endpoint = endpoint
|
|
@@ -317,7 +319,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
317
319
|
self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
|
|
318
320
|
df = pd.DataFrame(columns=x_columns)
|
|
319
321
|
self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
|
|
320
|
-
|
|
322
|
+
if print_loaded_report:
|
|
323
|
+
self.__show_selected_features()
|
|
321
324
|
# TODO validate search_keys with search_keys from file_metadata
|
|
322
325
|
print(self.bundle.get("search_by_task_id_finish"))
|
|
323
326
|
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
|
@@ -3226,8 +3229,11 @@ if response.status_code == 200:
|
|
|
3226
3229
|
df, self.fit_search_keys, self.fit_generated_features
|
|
3227
3230
|
)
|
|
3228
3231
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
3229
|
-
if normalizer.
|
|
3230
|
-
|
|
3232
|
+
if normalizer.removed_datetime_features:
|
|
3233
|
+
original_removed_datetime_features = [
|
|
3234
|
+
self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
|
|
3235
|
+
]
|
|
3236
|
+
self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))
|
|
3231
3237
|
|
|
3232
3238
|
non_feature_columns = [
|
|
3233
3239
|
self.TARGET_NAME,
|
|
@@ -4093,11 +4099,12 @@ if response.status_code == 200:
|
|
|
4093
4099
|
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
4094
4100
|
or set(search_keys.values()) == {SearchKey.HEM}
|
|
4095
4101
|
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
4102
|
+
or len(search_keys) == 0
|
|
4103
|
+
or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
|
|
4096
4104
|
):
|
|
4097
4105
|
if not silent:
|
|
4098
4106
|
self.__log_warning(bundle.get("current_date_added"))
|
|
4099
4107
|
df[CURRENT_DATE_COL] = datetime.date.today()
|
|
4100
|
-
# df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
|
|
4101
4108
|
search_keys[CURRENT_DATE_COL] = SearchKey.DATE
|
|
4102
4109
|
converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
|
|
4103
4110
|
df = converter.convert(df)
|
|
@@ -4781,7 +4788,8 @@ if response.status_code == 200:
|
|
|
4781
4788
|
else:
|
|
4782
4789
|
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4783
4790
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
4784
|
-
|
|
4791
|
+
# Current date will be added later
|
|
4792
|
+
# raise ValidationError(msg)
|
|
4785
4793
|
|
|
4786
4794
|
if (
|
|
4787
4795
|
len(valid_search_keys.values()) == 1
|
|
@@ -4900,6 +4908,16 @@ if response.status_code == 200:
|
|
|
4900
4908
|
search_key in self.fit_search_keys.values() and search_key not in search_keys.values()
|
|
4901
4909
|
)
|
|
4902
4910
|
|
|
4911
|
+
if check_need_detect(SearchKey.DATE) and check_need_detect(SearchKey.DATETIME):
|
|
4912
|
+
maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4913
|
+
if len(maybe_keys) > 0:
|
|
4914
|
+
datetime_key = maybe_keys[0]
|
|
4915
|
+
search_keys[datetime_key] = SearchKey.DATETIME
|
|
4916
|
+
self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
|
|
4917
|
+
self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
|
|
4918
|
+
if not silent_mode:
|
|
4919
|
+
print(self.bundle.get("datetime_detected").format(datetime_key))
|
|
4920
|
+
|
|
4903
4921
|
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
4904
4922
|
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
4905
4923
|
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.api.types import is_bool_dtype as is_bool
|
|
8
|
-
from pandas.api.types import is_datetime64_any_dtype as is_datetime
|
|
9
8
|
from pandas.api.types import (
|
|
10
9
|
is_float_dtype,
|
|
11
10
|
is_numeric_dtype,
|
|
@@ -45,7 +44,7 @@ class Normalizer:
|
|
|
45
44
|
self.columns_renaming = {}
|
|
46
45
|
self.search_keys = {}
|
|
47
46
|
self.generated_features = []
|
|
48
|
-
self.
|
|
47
|
+
self.removed_datetime_features = []
|
|
49
48
|
|
|
50
49
|
def normalize(
|
|
51
50
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -134,8 +133,9 @@ class Normalizer:
|
|
|
134
133
|
features = self._get_features(df)
|
|
135
134
|
|
|
136
135
|
for f in features:
|
|
137
|
-
|
|
138
|
-
|
|
136
|
+
converter = DateTimeConverter(f)
|
|
137
|
+
if converter.is_datetime(df):
|
|
138
|
+
self.removed_datetime_features.append(f)
|
|
139
139
|
df.drop(columns=f, inplace=True)
|
|
140
140
|
|
|
141
141
|
return df
|
|
@@ -210,6 +210,7 @@ features_info_zero_important_features=Oops, we can't find any relevant external
|
|
|
210
210
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
211
211
|
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
212
212
|
# Information
|
|
213
|
+
datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
213
214
|
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
214
215
|
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
215
216
|
country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
|
|
|
24
24
|
for column_name in other_columns:
|
|
25
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
26
26
|
columns_by_values.append(column_name)
|
|
27
|
-
|
|
27
|
+
|
|
28
|
+
both = [col for col in columns_by_names if col in columns_by_values]
|
|
29
|
+
only_values = [col for col in columns_by_values if col not in columns_by_names]
|
|
30
|
+
only_names = [col for col in columns_by_names if col not in columns_by_values]
|
|
31
|
+
return both + only_values + only_names
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -10,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
|
|
|
10
10
|
from upgini.errors import ValidationError
|
|
11
11
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
12
12
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
13
|
+
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
13
14
|
|
|
14
15
|
DATE_FORMATS = [
|
|
15
16
|
"%Y-%m-%d",
|
|
@@ -29,6 +30,15 @@ DATE_FORMATS = [
|
|
|
29
30
|
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
30
31
|
|
|
31
32
|
|
|
33
|
+
class DateSearchKeyDetector(BaseSearchKeyDetector):
|
|
34
|
+
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
35
|
+
lower_column_name = str(column_name).lower()
|
|
36
|
+
return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
|
|
37
|
+
|
|
38
|
+
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
39
|
+
return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
|
|
40
|
+
|
|
41
|
+
|
|
32
42
|
class DateTimeConverter:
|
|
33
43
|
DATETIME_COL = "_date_time"
|
|
34
44
|
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
@@ -80,7 +90,7 @@ class DateTimeConverter:
|
|
|
80
90
|
return True
|
|
81
91
|
|
|
82
92
|
parsed = self.parse_datetime(df, raise_errors=False)
|
|
83
|
-
return parsed is not None and
|
|
93
|
+
return parsed is not None and parsed.isna().mean() <= 0.5
|
|
84
94
|
|
|
85
95
|
def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
|
|
86
96
|
if len(df) == 0 or df[self.date_column].isna().all():
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=XCQXAFpTbucTcUYjHmbmGVUtguvbpWMCujKCLALhk3U,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=nb40GDtl7FEWn13z_oFXN_Q67Hh_XTOHoDY8ASgcDSw,237111
|
|
7
7
|
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
8
|
upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
@@ -35,10 +35,10 @@ upgini/data_source/data_source_publisher.py,sha256=CQi3fEukaStV-RiadSEvEFLThOlZJ
|
|
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
38
|
+
upgini/normalizer/normalize_utils.py,sha256=oKevieBChYxtugocrbev8Uz2vqSZ_PB4Ibo55fHOylM,8452
|
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=Dym1ymRH2uTeb0CfrO_lccoP9LWj7lxNGvsxmR7vkSw,29583
|
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
44
|
upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
|
|
@@ -46,13 +46,13 @@ upgini/sampler/random_under_sampler.py,sha256=4mofmaRTmNwT_HqxecWJyfXdLKK0h9jMBw
|
|
|
46
46
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
47
47
|
upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
|
|
48
48
|
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
49
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
49
|
+
upgini/utils/base_search_key_detector.py,sha256=DCmTbfhCdirJaDMpbKicFvhgVv6a9NNf9tXy3AbMCEg,1258
|
|
50
50
|
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
51
51
|
upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
|
|
52
52
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
53
53
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
54
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
55
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
55
|
+
upgini/utils/datetime_utils.py,sha256=pTi5doYjjjlQgwEVDNrJuKaAEC3Jtx78fjX4W7M_UV4,17615
|
|
56
56
|
upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
|
|
57
57
|
upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.
|
|
79
|
-
upgini-1.2.
|
|
80
|
-
upgini-1.2.
|
|
77
|
+
upgini-1.2.138.dist-info/METADATA,sha256=AQ9v8a70kG-cbEps3pwuy1Rl0vUX-bHq3L11k4r5mp0,51164
|
|
78
|
+
upgini-1.2.138.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.138.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.138.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|