upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -44,12 +44,14 @@ class FeaturesValidator:
44
44
  else:
45
45
  empty_or_constant_features.append(f)
46
46
 
47
- if one_hot_encoded_features:
48
- msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
49
- warnings.append(msg)
50
-
51
47
  columns_renaming = columns_renaming or {}
52
48
 
49
+ if one_hot_encoded_features and len(one_hot_encoded_features) > 1:
50
+ msg = bundle.get("one_hot_encoded_features").format(
51
+ [columns_renaming.get(f, f) for f in one_hot_encoded_features]
52
+ )
53
+ warnings.append(msg)
54
+
53
55
  if empty_or_constant_features:
54
56
  msg = bundle.get("empty_or_contant_features").format(
55
57
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
@@ -4,16 +4,49 @@ from pandas.api.types import (
4
4
  is_object_dtype,
5
5
  is_string_dtype,
6
6
  )
7
+ import re
7
8
 
8
9
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
9
10
 
10
11
 
11
12
  class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
13
+ postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
14
+
12
15
  def _is_search_key_by_name(self, column_name: str) -> bool:
13
- return str(column_name).lower() in ["zip", "zipcode", "zip_code", "postal_code", "postalcode"]
16
+ return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
14
17
 
15
18
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
16
- return False
19
+ """
20
+ # Fast two-step check whether the column looks like a postal code.
21
+ # Returns True if, after removing missing values, values remain,
22
+ # and all of them match the common characteristics of a postal code.
23
+ """
24
+ # Check only columns that are candidates for postal code by column name
25
+ if not self._is_search_key_by_name(column.name):
26
+ return False
27
+
28
+ s = column.copy().dropna().astype(str).str.strip()
29
+ s = s[s != ""] # remove empty strings
30
+ if s.empty:
31
+ return False
32
+
33
+ # remove suffix ".0" (often after float)
34
+ s = s.str.replace(r"\.0$", "", regex=True)
35
+
36
+ # --- Step 1: fast filtering ---
37
+ mask_len = s.str.len().between(2, 10)
38
+ mask_digit = s.str.contains(r'\d', regex=True)
39
+ mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
40
+ fast_mask = mask_len & mask_digit & mask_chars
41
+
42
+ # if any of them failed the fast check, return False
43
+ if not fast_mask.all():
44
+ return False
45
+
46
+ # --- Step 2: regex check ---
47
+ # only if the first step passed
48
+ valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
49
+ return valid_mask.all()
17
50
 
18
51
 
19
52
  class PostalCodeSearchKeyConverter:
@@ -31,6 +31,7 @@ def define_task(
31
31
  ) -> ModelTaskType:
32
32
  if logger is None:
33
33
  logger = logging.getLogger()
34
+ logger.setLevel(logging.FATAL)
34
35
 
35
36
  # Replace inf and -inf with NaN to handle extreme values correctly
36
37
  y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
@@ -143,7 +144,8 @@ def is_imbalanced(
143
144
  msg = bundle.get("dataset_rarest_class_less_min").format(
144
145
  min_class_value, min_class_count, MIN_TARGET_CLASS_ROWS
145
146
  )
146
- raise ValidationError(msg)
147
+ print(msg)
148
+ # raise ValidationError(msg)
147
149
 
148
150
  min_class_percent = IMBALANCE_THESHOLD / target_classes_count
149
151
  min_class_threshold = min_class_percent * count
@@ -5,6 +5,7 @@ import sys
5
5
  from functools import lru_cache
6
6
  from getpass import getuser
7
7
  from hashlib import sha256
8
+ from threading import Event, Lock
8
9
  from typing import Optional
9
10
  from uuid import getnode
10
11
 
@@ -51,8 +52,12 @@ def _get_execution_ide() -> str:
51
52
  return "other"
52
53
 
53
54
 
55
+ _inflight_lock = Lock()
56
+ _inflight_events = {}
57
+
58
+
54
59
  @lru_cache
55
- def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
60
+ def _compute_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
56
61
  # default values
57
62
  track = {"ide": _get_execution_ide()}
58
63
  ident_res = "https://api64.ipify.org"
@@ -164,3 +169,26 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
164
169
  track["ip"] = "0.0.0.0"
165
170
 
166
171
  return track
172
+
173
+
174
+ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
175
+ key = (client_ip, client_visitorid)
176
+ with _inflight_lock:
177
+ event = _inflight_events.get(key)
178
+ if event is None:
179
+ event = Event()
180
+ _inflight_events[key] = event
181
+ is_owner = True
182
+ else:
183
+ is_owner = False
184
+
185
+ if not is_owner:
186
+ event.wait()
187
+ return _compute_track_metrics(client_ip, client_visitorid)
188
+
189
+ try:
190
+ return _compute_track_metrics(client_ip, client_visitorid)
191
+ finally:
192
+ with _inflight_lock:
193
+ event.set()
194
+ _inflight_events.pop(key, None)