upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -3
- upgini/data_source/data_source_publisher.py +1 -9
- upgini/dataset.py +56 -6
- upgini/features_enricher.py +634 -556
- upgini/http.py +2 -2
- upgini/metadata.py +16 -2
- upgini/normalizer/normalize_utils.py +6 -6
- upgini/resource_bundle/strings.properties +15 -11
- upgini/search_task.py +14 -2
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +125 -39
- upgini/utils/deduplicate_utils.py +8 -5
- upgini/utils/display_utils.py +61 -20
- upgini/utils/feature_info.py +18 -7
- upgini/utils/features_validator.py +6 -4
- upgini/utils/postal_code_utils.py +35 -2
- upgini/utils/target_utils.py +3 -1
- upgini/utils/track_info.py +29 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/METADATA +123 -121
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/RECORD +23 -23
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/WHEEL +1 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/licenses/LICENSE +0 -0
|
@@ -44,12 +44,14 @@ class FeaturesValidator:
|
|
|
44
44
|
else:
|
|
45
45
|
empty_or_constant_features.append(f)
|
|
46
46
|
|
|
47
|
-
if one_hot_encoded_features:
|
|
48
|
-
msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
|
49
|
-
warnings.append(msg)
|
|
50
|
-
|
|
51
47
|
columns_renaming = columns_renaming or {}
|
|
52
48
|
|
|
49
|
+
if one_hot_encoded_features and len(one_hot_encoded_features) > 1:
|
|
50
|
+
msg = bundle.get("one_hot_encoded_features").format(
|
|
51
|
+
[columns_renaming.get(f, f) for f in one_hot_encoded_features]
|
|
52
|
+
)
|
|
53
|
+
warnings.append(msg)
|
|
54
|
+
|
|
53
55
|
if empty_or_constant_features:
|
|
54
56
|
msg = bundle.get("empty_or_contant_features").format(
|
|
55
57
|
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
@@ -4,16 +4,49 @@ from pandas.api.types import (
|
|
|
4
4
|
is_object_dtype,
|
|
5
5
|
is_string_dtype,
|
|
6
6
|
)
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
13
|
+
postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
|
|
14
|
+
|
|
12
15
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
13
|
-
return str(column_name).lower()
|
|
16
|
+
return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
|
|
14
17
|
|
|
15
18
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
16
|
-
|
|
19
|
+
"""
|
|
20
|
+
# Fast two-step check whether the column looks like a postal code.
|
|
21
|
+
# Returns True if, after removing missing values, values remain,
|
|
22
|
+
# and all of them match the common characteristics of a postal code.
|
|
23
|
+
"""
|
|
24
|
+
# Check only columns that are candidates for postal code by column name
|
|
25
|
+
if not self._is_search_key_by_name(column.name):
|
|
26
|
+
return False
|
|
27
|
+
|
|
28
|
+
s = column.copy().dropna().astype(str).str.strip()
|
|
29
|
+
s = s[s != ""] # remove empty strings
|
|
30
|
+
if s.empty:
|
|
31
|
+
return False
|
|
32
|
+
|
|
33
|
+
# remove suffix ".0" (often after float)
|
|
34
|
+
s = s.str.replace(r"\.0$", "", regex=True)
|
|
35
|
+
|
|
36
|
+
# --- Step 1: fast filtering ---
|
|
37
|
+
mask_len = s.str.len().between(2, 10)
|
|
38
|
+
mask_digit = s.str.contains(r'\d', regex=True)
|
|
39
|
+
mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
|
|
40
|
+
fast_mask = mask_len & mask_digit & mask_chars
|
|
41
|
+
|
|
42
|
+
# if any of them failed the fast check, return False
|
|
43
|
+
if not fast_mask.all():
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
# --- Step 2: regex check ---
|
|
47
|
+
# only if the first step passed
|
|
48
|
+
valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
|
|
49
|
+
return valid_mask.all()
|
|
17
50
|
|
|
18
51
|
|
|
19
52
|
class PostalCodeSearchKeyConverter:
|
upgini/utils/target_utils.py
CHANGED
|
@@ -31,6 +31,7 @@ def define_task(
|
|
|
31
31
|
) -> ModelTaskType:
|
|
32
32
|
if logger is None:
|
|
33
33
|
logger = logging.getLogger()
|
|
34
|
+
logger.setLevel(logging.FATAL)
|
|
34
35
|
|
|
35
36
|
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
36
37
|
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
@@ -143,7 +144,8 @@ def is_imbalanced(
|
|
|
143
144
|
msg = bundle.get("dataset_rarest_class_less_min").format(
|
|
144
145
|
min_class_value, min_class_count, MIN_TARGET_CLASS_ROWS
|
|
145
146
|
)
|
|
146
|
-
|
|
147
|
+
print(msg)
|
|
148
|
+
# raise ValidationError(msg)
|
|
147
149
|
|
|
148
150
|
min_class_percent = IMBALANCE_THESHOLD / target_classes_count
|
|
149
151
|
min_class_threshold = min_class_percent * count
|
upgini/utils/track_info.py
CHANGED
|
@@ -5,6 +5,7 @@ import sys
|
|
|
5
5
|
from functools import lru_cache
|
|
6
6
|
from getpass import getuser
|
|
7
7
|
from hashlib import sha256
|
|
8
|
+
from threading import Event, Lock
|
|
8
9
|
from typing import Optional
|
|
9
10
|
from uuid import getnode
|
|
10
11
|
|
|
@@ -51,8 +52,12 @@ def _get_execution_ide() -> str:
|
|
|
51
52
|
return "other"
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
_inflight_lock = Lock()
|
|
56
|
+
_inflight_events = {}
|
|
57
|
+
|
|
58
|
+
|
|
54
59
|
@lru_cache
|
|
55
|
-
def
|
|
60
|
+
def _compute_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
61
|
# default values
|
|
57
62
|
track = {"ide": _get_execution_ide()}
|
|
58
63
|
ident_res = "https://api64.ipify.org"
|
|
@@ -164,3 +169,26 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
164
169
|
track["ip"] = "0.0.0.0"
|
|
165
170
|
|
|
166
171
|
return track
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
175
|
+
key = (client_ip, client_visitorid)
|
|
176
|
+
with _inflight_lock:
|
|
177
|
+
event = _inflight_events.get(key)
|
|
178
|
+
if event is None:
|
|
179
|
+
event = Event()
|
|
180
|
+
_inflight_events[key] = event
|
|
181
|
+
is_owner = True
|
|
182
|
+
else:
|
|
183
|
+
is_owner = False
|
|
184
|
+
|
|
185
|
+
if not is_owner:
|
|
186
|
+
event.wait()
|
|
187
|
+
return _compute_track_metrics(client_ip, client_visitorid)
|
|
188
|
+
|
|
189
|
+
try:
|
|
190
|
+
return _compute_track_metrics(client_ip, client_visitorid)
|
|
191
|
+
finally:
|
|
192
|
+
with _inflight_lock:
|
|
193
|
+
event.set()
|
|
194
|
+
_inflight_events.pop(key, None)
|