upgini 1.2.122a4__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -31,7 +31,10 @@ class FeatureInfo:
31
31
 
32
32
  @staticmethod
33
33
  def from_metadata(
34
- feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame], is_client_feature: bool
34
+ feature_meta: FeaturesMetadataV2,
35
+ data: Optional[pd.DataFrame],
36
+ is_client_feature: bool,
37
+ is_generated_feature: bool,
35
38
  ) -> "FeatureInfo":
36
39
  return FeatureInfo(
37
40
  name=_get_name(feature_meta),
@@ -41,8 +44,8 @@ class FeatureInfo:
41
44
  value_preview=_get_feature_sample(feature_meta, data),
42
45
  provider=_get_provider(feature_meta, is_client_feature),
43
46
  internal_provider=_get_internal_provider(feature_meta, is_client_feature),
44
- source=_get_source(feature_meta, is_client_feature),
45
- internal_source=_get_internal_source(feature_meta, is_client_feature),
47
+ source=_get_source(feature_meta, is_client_feature, is_generated_feature),
48
+ internal_source=_get_internal_source(feature_meta, is_client_feature, is_generated_feature),
46
49
  update_frequency=feature_meta.update_frequency,
47
50
  commercial_schema=feature_meta.commercial_schema,
48
51
  doc_link=feature_meta.doc_link,
@@ -139,22 +142,30 @@ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature:
139
142
  return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
140
143
 
141
144
 
142
- def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
145
+ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
146
+ if is_generated_feature:
147
+ return "AutoFE: features from Training dataset"
148
+
143
149
  sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
144
150
  source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
145
151
  if sources:
146
152
  source = _make_links(sources, source_links)
147
153
  else:
148
- source = _get_internal_source(feature_meta, is_client_feature)
154
+ source = _get_internal_source(feature_meta, is_client_feature, is_generated_feature)
149
155
  return source
150
156
 
151
157
 
152
- def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
158
+ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
159
+ if is_generated_feature:
160
+ return "AutoFE: features from Training dataset"
161
+
153
162
  sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
154
163
  if sources:
155
164
  return ", ".join(sources)
165
+ elif feature_meta.data_source:
166
+ return feature_meta.data_source
156
167
  else:
157
- return feature_meta.data_source or (
168
+ return (
158
169
  LLM_SOURCE
159
170
  if not feature_meta.name.endswith("_country")
160
171
  and not feature_meta.name.endswith("_postal_code")
@@ -44,12 +44,14 @@ class FeaturesValidator:
44
44
  else:
45
45
  empty_or_constant_features.append(f)
46
46
 
47
- if one_hot_encoded_features:
48
- msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
49
- warnings.append(msg)
50
-
51
47
  columns_renaming = columns_renaming or {}
52
48
 
49
+ if one_hot_encoded_features and len(one_hot_encoded_features) > 1:
50
+ msg = bundle.get("one_hot_encoded_features").format(
51
+ [columns_renaming.get(f, f) for f in one_hot_encoded_features]
52
+ )
53
+ warnings.append(msg)
54
+
53
55
  if empty_or_constant_features:
54
56
  msg = bundle.get("empty_or_contant_features").format(
55
57
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
@@ -4,16 +4,49 @@ from pandas.api.types import (
4
4
  is_object_dtype,
5
5
  is_string_dtype,
6
6
  )
7
+ import re
7
8
 
8
9
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
9
10
 
10
11
 
11
12
  class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
13
+ postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
14
+
12
15
  def _is_search_key_by_name(self, column_name: str) -> bool:
13
- return str(column_name).lower() in ["zip", "zipcode", "zip_code", "postal_code", "postalcode"]
16
+ return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
14
17
 
15
18
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
16
- return False
19
+ """
20
+ # Fast two-step check whether the column looks like a postal code.
21
+ # Returns True if, after removing missing values, values remain,
22
+ # and all of them match the common characteristics of a postal code.
23
+ """
24
+ # Check only columns that are candidates for postal code by column name
25
+ if not self._is_search_key_by_name(column.name):
26
+ return False
27
+
28
+ s = column.copy().dropna().astype(str).str.strip()
29
+ s = s[s != ""] # remove empty strings
30
+ if s.empty:
31
+ return False
32
+
33
+ # remove suffix ".0" (often after float)
34
+ s = s.str.replace(r"\.0$", "", regex=True)
35
+
36
+ # --- Step 1: fast filtering ---
37
+ mask_len = s.str.len().between(2, 10)
38
+ mask_digit = s.str.contains(r'\d', regex=True)
39
+ mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
40
+ fast_mask = mask_len & mask_digit & mask_chars
41
+
42
+ # if any of them failed the fast check, return False
43
+ if not fast_mask.all():
44
+ return False
45
+
46
+ # --- Step 2: regex check ---
47
+ # only if the first step passed
48
+ valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
49
+ return valid_mask.all()
17
50
 
18
51
 
19
52
  class PostalCodeSearchKeyConverter:
@@ -31,6 +31,7 @@ def define_task(
31
31
  ) -> ModelTaskType:
32
32
  if logger is None:
33
33
  logger = logging.getLogger()
34
+ logger.setLevel(logging.FATAL)
34
35
 
35
36
  # Replace inf and -inf with NaN to handle extreme values correctly
36
37
  y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
@@ -143,7 +144,8 @@ def is_imbalanced(
143
144
  msg = bundle.get("dataset_rarest_class_less_min").format(
144
145
  min_class_value, min_class_count, MIN_TARGET_CLASS_ROWS
145
146
  )
146
- raise ValidationError(msg)
147
+ print(msg)
148
+ # raise ValidationError(msg)
147
149
 
148
150
  min_class_percent = IMBALANCE_THESHOLD / target_classes_count
149
151
  min_class_threshold = min_class_percent * count
@@ -5,6 +5,7 @@ import sys
5
5
  from functools import lru_cache
6
6
  from getpass import getuser
7
7
  from hashlib import sha256
8
+ from threading import Event, Lock
8
9
  from typing import Optional
9
10
  from uuid import getnode
10
11
 
@@ -51,8 +52,12 @@ def _get_execution_ide() -> str:
51
52
  return "other"
52
53
 
53
54
 
55
+ _inflight_lock = Lock()
56
+ _inflight_events = {}
57
+
58
+
54
59
  @lru_cache
55
- def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
60
+ def _compute_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
56
61
  # default values
57
62
  track = {"ide": _get_execution_ide()}
58
63
  ident_res = "https://api64.ipify.org"
@@ -164,3 +169,26 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
164
169
  track["ip"] = "0.0.0.0"
165
170
 
166
171
  return track
172
+
173
+
174
+ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
175
+ key = (client_ip, client_visitorid)
176
+ with _inflight_lock:
177
+ event = _inflight_events.get(key)
178
+ if event is None:
179
+ event = Event()
180
+ _inflight_events[key] = event
181
+ is_owner = True
182
+ else:
183
+ is_owner = False
184
+
185
+ if not is_owner:
186
+ event.wait()
187
+ return _compute_track_metrics(client_ip, client_visitorid)
188
+
189
+ try:
190
+ return _compute_track_metrics(client_ip, client_visitorid)
191
+ finally:
192
+ with _inflight_lock:
193
+ event.set()
194
+ _inflight_events.pop(key, None)