upgini 1.2.143__py3-none-any.whl → 1.2.145__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +26 -17
- upgini/search_task.py +2 -1
- upgini/utils/postal_code_utils.py +4 -0
- {upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/METADATA +1 -1
- {upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/RECORD +8 -8
- {upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/WHEEL +0 -0
- {upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.143"
|
|
1
|
+
__version__ = "1.2.145"
|
upgini/features_enricher.py
CHANGED
|
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
274
274
|
self.X: pd.DataFrame | None = None
|
|
275
275
|
self.y: pd.Series | None = None
|
|
276
276
|
self.eval_set: list[tuple] | None = None
|
|
277
|
-
self.autodetected_search_keys: dict[str, SearchKey] =
|
|
277
|
+
self.autodetected_search_keys: dict[str, SearchKey] | None = None
|
|
278
278
|
self.imbalanced = False
|
|
279
279
|
self.fit_select_features = True
|
|
280
280
|
self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
|
|
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1311
1311
|
def _get_autodetected_search_keys(self):
|
|
1312
1312
|
if self.autodetected_search_keys is None and self._search_task is not None:
|
|
1313
1313
|
meta = self._search_task.get_file_metadata(self._get_trace_id())
|
|
1314
|
-
|
|
1314
|
+
autodetected_search_keys = meta.autodetectedSearchKeys or {}
|
|
1315
|
+
self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
|
|
1315
1316
|
|
|
1316
1317
|
return self.autodetected_search_keys
|
|
1317
1318
|
|
|
1319
|
+
def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
|
|
1320
|
+
if self.autodetected_search_keys is None:
|
|
1321
|
+
self.autodetected_search_keys = dict()
|
|
1322
|
+
self.autodetected_search_keys.update(adding_search_keys)
|
|
1323
|
+
return self.autodetected_search_keys
|
|
1324
|
+
|
|
1318
1325
|
def _get_fit_search_keys_with_original_names(self):
|
|
1319
1326
|
if self.fit_search_keys is None and self._search_task is not None:
|
|
1320
1327
|
fit_search_keys = dict()
|
|
@@ -2954,10 +2961,6 @@ if response.status_code == 200:
|
|
|
2954
2961
|
if add_fit_system_record_id:
|
|
2955
2962
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2956
2963
|
|
|
2957
|
-
for c in result.columns:
|
|
2958
|
-
if result[c].dtype == "category":
|
|
2959
|
-
result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
|
|
2960
|
-
|
|
2961
2964
|
return result, columns_renaming, generated_features, search_keys
|
|
2962
2965
|
|
|
2963
2966
|
def _selecting_input_and_generated_columns(
|
|
@@ -2997,15 +3000,16 @@ if response.status_code == 200:
|
|
|
2997
3000
|
|
|
2998
3001
|
return selected_input_columns + selected_generated_features
|
|
2999
3002
|
|
|
3000
|
-
def
|
|
3003
|
+
def _validate_empty_search_keys(self, search_keys: dict[str, SearchKey], is_transform: bool = False):
|
|
3001
3004
|
if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
|
|
3002
|
-
if
|
|
3003
|
-
self.logger.debug(
|
|
3005
|
+
if is_transform:
|
|
3006
|
+
self.logger.debug("Transform started without search_keys")
|
|
3004
3007
|
return
|
|
3005
3008
|
else:
|
|
3006
3009
|
self.logger.warning("search_keys not provided")
|
|
3007
3010
|
raise ValidationError(self.bundle.get("empty_search_keys"))
|
|
3008
3011
|
|
|
3012
|
+
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
|
3009
3013
|
key_types = search_keys.values()
|
|
3010
3014
|
|
|
3011
3015
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
@@ -3647,7 +3651,8 @@ if response.status_code == 200:
|
|
|
3647
3651
|
keys.append("EMAIL")
|
|
3648
3652
|
if "DATE" in keys:
|
|
3649
3653
|
keys.append("DATETIME")
|
|
3650
|
-
|
|
3654
|
+
autodetected_search_keys = self.autodetected_search_keys or {}
|
|
3655
|
+
search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
|
|
3651
3656
|
return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
|
|
3652
3657
|
|
|
3653
3658
|
def _validate_train_eval(
|
|
@@ -4797,6 +4802,8 @@ if response.status_code == 200:
|
|
|
4797
4802
|
|
|
4798
4803
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
4799
4804
|
|
|
4805
|
+
self._validate_empty_search_keys(valid_search_keys, is_transform=is_transform)
|
|
4806
|
+
|
|
4800
4807
|
return valid_search_keys
|
|
4801
4808
|
|
|
4802
4809
|
def __show_metrics(
|
|
@@ -4882,8 +4889,9 @@ if response.status_code == 200:
|
|
|
4882
4889
|
maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4883
4890
|
if len(maybe_keys) > 0:
|
|
4884
4891
|
datetime_key = maybe_keys[0]
|
|
4885
|
-
|
|
4886
|
-
|
|
4892
|
+
new_keys = {datetime_key: SearchKey.DATETIME}
|
|
4893
|
+
search_keys.update(new_keys)
|
|
4894
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4887
4895
|
self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
|
|
4888
4896
|
print(self.bundle.get("datetime_detected").format(datetime_key))
|
|
4889
4897
|
|
|
@@ -4892,15 +4900,16 @@ if response.status_code == 200:
|
|
|
4892
4900
|
if maybe_keys:
|
|
4893
4901
|
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
4894
4902
|
search_keys.update(new_keys)
|
|
4895
|
-
self.
|
|
4903
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4896
4904
|
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
4897
4905
|
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
4898
4906
|
|
|
4899
4907
|
if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
|
|
4900
4908
|
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4901
4909
|
if maybe_key:
|
|
4902
|
-
|
|
4903
|
-
|
|
4910
|
+
new_keys = {maybe_key[0]: SearchKey.COUNTRY}
|
|
4911
|
+
search_keys.update(new_keys)
|
|
4912
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4904
4913
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
4905
4914
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
4906
4915
|
|
|
@@ -4910,7 +4919,7 @@ if response.status_code == 200:
|
|
|
4910
4919
|
if self.__is_registered or is_demo_dataset:
|
|
4911
4920
|
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
4912
4921
|
search_keys.update(new_keys)
|
|
4913
|
-
self.
|
|
4922
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4914
4923
|
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
4915
4924
|
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
4916
4925
|
else:
|
|
@@ -4926,7 +4935,7 @@ if response.status_code == 200:
|
|
|
4926
4935
|
if self.__is_registered or is_demo_dataset:
|
|
4927
4936
|
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
4928
4937
|
search_keys.update(new_keys)
|
|
4929
|
-
self.
|
|
4938
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4930
4939
|
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
4931
4940
|
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
4932
4941
|
else:
|
upgini/search_task.py
CHANGED
|
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
|
|
|
434
434
|
tmp_file_name = f"{tmp_dir}/{file_name}"
|
|
435
435
|
with open(tmp_file_name, "wb") as gzip_file:
|
|
436
436
|
gzip_file.write(file_content)
|
|
437
|
-
|
|
437
|
+
# Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
|
|
438
|
+
return pd.read_parquet(tmp_file_name, engine="pyarrow")
|
upgini/utils/postal_code_utils.py
CHANGED
|
@@ -21,6 +21,10 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
21
21
|
# Returns True if, after removing missing values, values remain,
|
|
22
22
|
# and all of them match the common characteristics of a postal code.
|
|
23
23
|
"""
|
|
24
|
+
# Check only columns that are candidates for postal code by column name
|
|
25
|
+
if not self._is_search_key_by_name(column.name):
|
|
26
|
+
return False
|
|
27
|
+
|
|
24
28
|
s = column.copy().dropna().astype(str).str.strip()
|
|
25
29
|
s = s[s != ""] # remove empty strings
|
|
26
30
|
if s.empty:
|
{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/RECORD
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=4jbqu35vw7uX_qx6RMvIXk8QXoruhLaE4ANuc3Ev6ks,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=7vAiAQNGH5BJENp22NfgFRJfsBNSYyFlIHJmPmA5F-I,235217
|
|
7
7
|
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
8
|
upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
10
|
-
upgini/search_task.py,sha256=
|
|
10
|
+
upgini/search_task.py,sha256=zEXqfX95g0aLtVQUpwQjCvFqqMCAIWELAJRoeYBP5tQ,18746
|
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
13
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
|
|
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
|
65
65
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
67
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
67
|
+
upgini/utils/postal_code_utils.py,sha256=COQ23lYcLS2Ky-qI-eBOVxhEMdBl7J2cFDT9F3XdswI,3003
|
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
69
69
|
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.143.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
-
upgini-1.2.143.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
-
upgini-1.2.143.dist-info/RECORD,,
|
|
77
|
+
upgini-1.2.145.dist-info/METADATA,sha256=p_rXUVOW9xW1zDnHfPkzMOUvs3Jumd7teykmLYWyEIM,51164
|
|
78
|
+
upgini-1.2.145.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.145.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.145.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|