upgini 1.2.142a2__py3-none-any.whl → 1.2.143__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +5 -5
- upgini/utils/postal_code_utils.py +31 -2
- {upgini-1.2.142a2.dist-info → upgini-1.2.143.dist-info}/METADATA +1 -1
- {upgini-1.2.142a2.dist-info → upgini-1.2.143.dist-info}/RECORD +7 -7
- {upgini-1.2.142a2.dist-info → upgini-1.2.143.dist-info}/WHEEL +0 -0
- {upgini-1.2.142a2.dist-info → upgini-1.2.143.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.142a2"
|
|
1
|
+
__version__ = "1.2.143"
|
upgini/features_enricher.py
CHANGED
|
@@ -2553,9 +2553,7 @@ if response.status_code == 200:
|
|
|
2553
2553
|
|
|
2554
2554
|
self.__validate_search_keys(search_keys, self.search_id)
|
|
2555
2555
|
|
|
2556
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(
|
|
2557
|
-
X, y, eval_set=None, is_transform=True
|
|
2558
|
-
)
|
|
2556
|
+
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
|
|
2559
2557
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
|
2560
2558
|
|
|
2561
2559
|
validated_Xy = df.copy()
|
|
@@ -4784,7 +4782,7 @@ if response.status_code == 200:
|
|
|
4784
4782
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
4785
4783
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
4786
4784
|
date_column = next(iter(maybe_date))
|
|
4787
|
-
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
4785
|
+
if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
|
|
4788
4786
|
msg = self.bundle.get("date_search_without_time_series")
|
|
4789
4787
|
self.__log_warning(msg)
|
|
4790
4788
|
|
|
@@ -5025,7 +5023,9 @@ if response.status_code == 200:
|
|
|
5025
5023
|
f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
|
|
5026
5024
|
)
|
|
5027
5025
|
else:
|
|
5028
|
-
self.rest_client.dump_input_file(
|
|
5026
|
+
self.rest_client.dump_input_file(
|
|
5027
|
+
trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
|
|
5028
|
+
)
|
|
5029
5029
|
|
|
5030
5030
|
if y_ is not None:
|
|
5031
5031
|
if isinstance(y_, pd.Series):
|
|
upgini/utils/postal_code_utils.py
CHANGED
|
@@ -4,16 +4,45 @@ from pandas.api.types import (
|
|
|
4
4
|
is_object_dtype,
|
|
5
5
|
is_string_dtype,
|
|
6
6
|
)
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
13
|
+
postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
|
|
14
|
+
|
|
12
15
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
13
|
-
return str(column_name).lower()
|
|
16
|
+
return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
|
|
14
17
|
|
|
15
18
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
16
|
-
|
|
19
|
+
"""
|
|
20
|
+
# Fast two-step check whether the column looks like a postal code.
|
|
21
|
+
# Returns True if, after removing missing values, values remain,
|
|
22
|
+
# and all of them match the common characteristics of a postal code.
|
|
23
|
+
"""
|
|
24
|
+
s = column.copy().dropna().astype(str).str.strip()
|
|
25
|
+
s = s[s != ""] # remove empty strings
|
|
26
|
+
if s.empty:
|
|
27
|
+
return False
|
|
28
|
+
|
|
29
|
+
# remove suffix ".0" (often after float)
|
|
30
|
+
s = s.str.replace(r"\.0$", "", regex=True)
|
|
31
|
+
|
|
32
|
+
# --- Step 1: fast filtering ---
|
|
33
|
+
mask_len = s.str.len().between(2, 10)
|
|
34
|
+
mask_digit = s.str.contains(r'\d', regex=True)
|
|
35
|
+
mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
|
|
36
|
+
fast_mask = mask_len & mask_digit & mask_chars
|
|
37
|
+
|
|
38
|
+
# if any of them failed the fast check, return False
|
|
39
|
+
if not fast_mask.all():
|
|
40
|
+
return False
|
|
41
|
+
|
|
42
|
+
# --- Step 2: regex check ---
|
|
43
|
+
# only if the first step passed
|
|
44
|
+
valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
|
|
45
|
+
return valid_mask.all()
|
|
17
46
|
|
|
18
47
|
|
|
19
48
|
class PostalCodeSearchKeyConverter:
|
|
{upgini-1.2.142a2.dist-info → upgini-1.2.143.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=TkxQ31o1wT_libI5BqRH2rhpiwyfbBiFlzZY9cCcBfw,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=KTSNUaJ_j6fBiCFnFfqY92kfSqGMXdgoIO2L7dyYM68,234739
|
|
7
7
|
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
8
|
upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
|
|
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
|
65
65
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
67
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
67
|
+
upgini/utils/postal_code_utils.py,sha256=sxdk32CTLvey6zqBbay_HtNzLqcn2eAYUJL-sNmJuDg,2840
|
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
69
69
|
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.142a2.dist-info/METADATA,sha256=
|
|
78
|
-
upgini-1.2.142a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
-
upgini-1.2.142a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
-
upgini-1.2.142a2.dist-info/RECORD,,
|
|
77
|
+
upgini-1.2.143.dist-info/METADATA,sha256=rabfdtvLV7EsidthqGBubCvVp8zWDjqDf7_Nur_1Dro,51164
|
|
78
|
+
upgini-1.2.143.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.143.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.143.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|