upgini 1.2.142a1__py3-none-any.whl → 1.2.143__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +6 -5
- upgini/utils/datetime_utils.py +7 -4
- upgini/utils/postal_code_utils.py +31 -2
- {upgini-1.2.142a1.dist-info → upgini-1.2.143.dist-info}/METADATA +1 -1
- {upgini-1.2.142a1.dist-info → upgini-1.2.143.dist-info}/RECORD +8 -8
- {upgini-1.2.142a1.dist-info → upgini-1.2.143.dist-info}/WHEEL +0 -0
- {upgini-1.2.142a1.dist-info → upgini-1.2.143.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.142a1"
|
|
1
|
+
__version__ = "1.2.143"
|
upgini/features_enricher.py
CHANGED
|
@@ -2553,9 +2553,7 @@ if response.status_code == 200:
|
|
|
2553
2553
|
|
|
2554
2554
|
self.__validate_search_keys(search_keys, self.search_id)
|
|
2555
2555
|
|
|
2556
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(
|
|
2557
|
-
X, y, eval_set=None, is_transform=True
|
|
2558
|
-
)
|
|
2556
|
+
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
|
|
2559
2557
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
|
2560
2558
|
|
|
2561
2559
|
validated_Xy = df.copy()
|
|
@@ -3239,6 +3237,7 @@ if response.status_code == 200:
|
|
|
3239
3237
|
)
|
|
3240
3238
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
3241
3239
|
if normalizer.removed_datetime_features:
|
|
3240
|
+
self.fit_dropped_features.update(normalizer.removed_datetime_features)
|
|
3242
3241
|
original_removed_datetime_features = [
|
|
3243
3242
|
self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
|
|
3244
3243
|
]
|
|
@@ -4783,7 +4782,7 @@ if response.status_code == 200:
|
|
|
4783
4782
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
4784
4783
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
4785
4784
|
date_column = next(iter(maybe_date))
|
|
4786
|
-
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
4785
|
+
if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
|
|
4787
4786
|
msg = self.bundle.get("date_search_without_time_series")
|
|
4788
4787
|
self.__log_warning(msg)
|
|
4789
4788
|
|
|
@@ -5024,7 +5023,9 @@ if response.status_code == 200:
|
|
|
5024
5023
|
f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
|
|
5025
5024
|
)
|
|
5026
5025
|
else:
|
|
5027
|
-
self.rest_client.dump_input_file(
|
|
5026
|
+
self.rest_client.dump_input_file(
|
|
5027
|
+
trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
|
|
5028
|
+
)
|
|
5028
5029
|
|
|
5029
5030
|
if y_ is not None:
|
|
5030
5031
|
if isinstance(y_, pd.Series):
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -282,10 +282,13 @@ class DateTimeConverter:
|
|
|
282
282
|
warnings.filterwarnings("ignore", message="Could not infer format")
|
|
283
283
|
return pd.to_datetime(df[self.date_column])
|
|
284
284
|
except ValueError:
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
285
|
+
try:
|
|
286
|
+
return pd.to_datetime(df[self.date_column], format="mixed", errors="raise")
|
|
287
|
+
except ValueError:
|
|
288
|
+
if raise_errors:
|
|
289
|
+
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
290
|
+
else:
|
|
291
|
+
return None
|
|
289
292
|
|
|
290
293
|
def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
291
294
|
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
upgini/utils/postal_code_utils.py
CHANGED
|
@@ -4,16 +4,45 @@ from pandas.api.types import (
|
|
|
4
4
|
is_object_dtype,
|
|
5
5
|
is_string_dtype,
|
|
6
6
|
)
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
13
|
+
postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
|
|
14
|
+
|
|
12
15
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
13
|
-
return str(column_name).lower()
|
|
16
|
+
return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
|
|
14
17
|
|
|
15
18
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
16
|
-
|
|
19
|
+
"""
|
|
20
|
+
# Fast two-step check whether the column looks like a postal code.
|
|
21
|
+
# Returns True if, after removing missing values, values remain,
|
|
22
|
+
# and all of them match the common characteristics of a postal code.
|
|
23
|
+
"""
|
|
24
|
+
s = column.copy().dropna().astype(str).str.strip()
|
|
25
|
+
s = s[s != ""] # remove empty strings
|
|
26
|
+
if s.empty:
|
|
27
|
+
return False
|
|
28
|
+
|
|
29
|
+
# remove suffix ".0" (often after float)
|
|
30
|
+
s = s.str.replace(r"\.0$", "", regex=True)
|
|
31
|
+
|
|
32
|
+
# --- Step 1: fast filtering ---
|
|
33
|
+
mask_len = s.str.len().between(2, 10)
|
|
34
|
+
mask_digit = s.str.contains(r'\d', regex=True)
|
|
35
|
+
mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
|
|
36
|
+
fast_mask = mask_len & mask_digit & mask_chars
|
|
37
|
+
|
|
38
|
+
# if any of them failed the fast check, return False
|
|
39
|
+
if not fast_mask.all():
|
|
40
|
+
return False
|
|
41
|
+
|
|
42
|
+
# --- Step 2: regex check ---
|
|
43
|
+
# only if the first step passed
|
|
44
|
+
valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
|
|
45
|
+
return valid_mask.all()
|
|
17
46
|
|
|
18
47
|
|
|
19
48
|
class PostalCodeSearchKeyConverter:
|
{upgini-1.2.142a1.dist-info → upgini-1.2.143.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=TkxQ31o1wT_libI5BqRH2rhpiwyfbBiFlzZY9cCcBfw,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=KTSNUaJ_j6fBiCFnFfqY92kfSqGMXdgoIO2L7dyYM68,234739
|
|
7
7
|
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
8
|
upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
@@ -52,7 +52,7 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
|
|
|
52
52
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
53
53
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
54
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
55
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
55
|
+
upgini/utils/datetime_utils.py,sha256=rr_aUjzKxj9I-0EPyRnWCquXkb4hdh6YcEDtoxeB2XE,17783
|
|
56
56
|
upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
|
|
57
57
|
upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
|
|
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
|
65
65
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
67
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
67
|
+
upgini/utils/postal_code_utils.py,sha256=sxdk32CTLvey6zqBbay_HtNzLqcn2eAYUJL-sNmJuDg,2840
|
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
69
69
|
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.142a1.dist-info/METADATA,sha256=
|
|
78
|
-
upgini-1.2.142a1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
-
upgini-1.2.142a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
-
upgini-1.2.142a1.dist-info/RECORD,,
|
|
77
|
+
upgini-1.2.143.dist-info/METADATA,sha256=rabfdtvLV7EsidthqGBubCvVp8zWDjqDf7_Nur_1Dro,51164
|
|
78
|
+
upgini-1.2.143.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.143.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.143.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|