upgini 1.2.143__py3-none-any.whl → 1.2.144__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +20 -14
- upgini/search_task.py +2 -1
- upgini/utils/postal_code_utils.py +4 -0
- {upgini-1.2.143.dist-info → upgini-1.2.144.dist-info}/METADATA +1 -1
- {upgini-1.2.143.dist-info → upgini-1.2.144.dist-info}/RECORD +8 -8
- {upgini-1.2.143.dist-info → upgini-1.2.144.dist-info}/WHEEL +0 -0
- {upgini-1.2.143.dist-info → upgini-1.2.144.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.143"
|
|
1
|
+
__version__ = "1.2.144"
|
upgini/features_enricher.py
CHANGED
|
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
274
274
|
self.X: pd.DataFrame | None = None
|
|
275
275
|
self.y: pd.Series | None = None
|
|
276
276
|
self.eval_set: list[tuple] | None = None
|
|
277
|
-
self.autodetected_search_keys: dict[str, SearchKey] =
|
|
277
|
+
self.autodetected_search_keys: dict[str, SearchKey] | None = None
|
|
278
278
|
self.imbalanced = False
|
|
279
279
|
self.fit_select_features = True
|
|
280
280
|
self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
|
|
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1311
1311
|
def _get_autodetected_search_keys(self):
|
|
1312
1312
|
if self.autodetected_search_keys is None and self._search_task is not None:
|
|
1313
1313
|
meta = self._search_task.get_file_metadata(self._get_trace_id())
|
|
1314
|
-
|
|
1314
|
+
autodetected_search_keys = meta.autodetectedSearchKeys or {}
|
|
1315
|
+
self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
|
|
1315
1316
|
|
|
1316
1317
|
return self.autodetected_search_keys
|
|
1317
1318
|
|
|
1319
|
+
def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
|
|
1320
|
+
if self.autodetected_search_keys is None:
|
|
1321
|
+
self.autodetected_search_keys = dict()
|
|
1322
|
+
self.autodetected_search_keys.update(adding_search_keys)
|
|
1323
|
+
return self.autodetected_search_keys
|
|
1324
|
+
|
|
1318
1325
|
def _get_fit_search_keys_with_original_names(self):
|
|
1319
1326
|
if self.fit_search_keys is None and self._search_task is not None:
|
|
1320
1327
|
fit_search_keys = dict()
|
|
@@ -2954,10 +2961,6 @@ if response.status_code == 200:
|
|
|
2954
2961
|
if add_fit_system_record_id:
|
|
2955
2962
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2956
2963
|
|
|
2957
|
-
for c in result.columns:
|
|
2958
|
-
if result[c].dtype == "category":
|
|
2959
|
-
result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
|
|
2960
|
-
|
|
2961
2964
|
return result, columns_renaming, generated_features, search_keys
|
|
2962
2965
|
|
|
2963
2966
|
def _selecting_input_and_generated_columns(
|
|
@@ -3647,7 +3650,8 @@ if response.status_code == 200:
|
|
|
3647
3650
|
keys.append("EMAIL")
|
|
3648
3651
|
if "DATE" in keys:
|
|
3649
3652
|
keys.append("DATETIME")
|
|
3650
|
-
|
|
3653
|
+
autodetected_search_keys = self.autodetected_search_keys or {}
|
|
3654
|
+
search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
|
|
3651
3655
|
return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
|
|
3652
3656
|
|
|
3653
3657
|
def _validate_train_eval(
|
|
@@ -4882,8 +4886,9 @@ if response.status_code == 200:
|
|
|
4882
4886
|
maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4883
4887
|
if len(maybe_keys) > 0:
|
|
4884
4888
|
datetime_key = maybe_keys[0]
|
|
4885
|
-
|
|
4886
|
-
|
|
4889
|
+
new_keys = {datetime_key: SearchKey.DATETIME}
|
|
4890
|
+
search_keys.update(new_keys)
|
|
4891
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4887
4892
|
self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
|
|
4888
4893
|
print(self.bundle.get("datetime_detected").format(datetime_key))
|
|
4889
4894
|
|
|
@@ -4892,15 +4897,16 @@ if response.status_code == 200:
|
|
|
4892
4897
|
if maybe_keys:
|
|
4893
4898
|
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
4894
4899
|
search_keys.update(new_keys)
|
|
4895
|
-
self.
|
|
4900
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4896
4901
|
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
4897
4902
|
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
4898
4903
|
|
|
4899
4904
|
if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
|
|
4900
4905
|
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4901
4906
|
if maybe_key:
|
|
4902
|
-
|
|
4903
|
-
|
|
4907
|
+
new_keys = {maybe_key[0]: SearchKey.COUNTRY}
|
|
4908
|
+
search_keys.update(new_keys)
|
|
4909
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4904
4910
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
4905
4911
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
4906
4912
|
|
|
@@ -4910,7 +4916,7 @@ if response.status_code == 200:
|
|
|
4910
4916
|
if self.__is_registered or is_demo_dataset:
|
|
4911
4917
|
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
4912
4918
|
search_keys.update(new_keys)
|
|
4913
|
-
self.
|
|
4919
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4914
4920
|
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
4915
4921
|
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
4916
4922
|
else:
|
|
@@ -4926,7 +4932,7 @@ if response.status_code == 200:
|
|
|
4926
4932
|
if self.__is_registered or is_demo_dataset:
|
|
4927
4933
|
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
4928
4934
|
search_keys.update(new_keys)
|
|
4929
|
-
self.
|
|
4935
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4930
4936
|
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
4931
4937
|
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
4932
4938
|
else:
|
upgini/search_task.py
CHANGED
|
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
|
|
|
434
434
|
tmp_file_name = f"{tmp_dir}/{file_name}"
|
|
435
435
|
with open(tmp_file_name, "wb") as gzip_file:
|
|
436
436
|
gzip_file.write(file_content)
|
|
437
|
-
|
|
437
|
+
# Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
|
|
438
|
+
return pd.read_parquet(tmp_file_name, engine="pyarrow")
|
upgini/utils/postal_code_utils.py
CHANGED
|
@@ -21,6 +21,10 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
21
21
|
# Returns True if, after removing missing values, values remain,
|
|
22
22
|
# and all of them match the common characteristics of a postal code.
|
|
23
23
|
"""
|
|
24
|
+
# Check only columns that are candidates for postal code by column name
|
|
25
|
+
if not self._is_search_key_by_name(column.name):
|
|
26
|
+
return False
|
|
27
|
+
|
|
24
28
|
s = column.copy().dropna().astype(str).str.strip()
|
|
25
29
|
s = s[s != ""] # remove empty strings
|
|
26
30
|
if s.empty:
|
{upgini-1.2.143.dist-info → upgini-1.2.144.dist-info}/RECORD
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=aics4V8KhPF0GdMfEahLLcODv_MyCYM-IVrE84vOyM8,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=f_l0qOzclD6Zi2vEqfFbkFwuxDxFU9nFuaC88leDJGg,235034
|
|
7
7
|
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
8
|
upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
10
|
-
upgini/search_task.py,sha256=
|
|
10
|
+
upgini/search_task.py,sha256=zEXqfX95g0aLtVQUpwQjCvFqqMCAIWELAJRoeYBP5tQ,18746
|
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
13
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
|
|
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
|
65
65
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
67
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
67
|
+
upgini/utils/postal_code_utils.py,sha256=COQ23lYcLS2Ky-qI-eBOVxhEMdBl7J2cFDT9F3XdswI,3003
|
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
69
69
|
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.143.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
-
upgini-1.2.143.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
-
upgini-1.2.143.dist-info/RECORD,,
|
|
77
|
+
upgini-1.2.144.dist-info/METADATA,sha256=hBvc1KOqsAiItKjYzQDkxt07e8v3GAMQuOkYoNyRrl8,51164
|
|
78
|
+
upgini-1.2.144.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.144.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.144.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|