upgini 1.2.142a2__py3-none-any.whl → 1.2.144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.142a2"
1
+ __version__ = "1.2.144"
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
274
274
  self.X: pd.DataFrame | None = None
275
275
  self.y: pd.Series | None = None
276
276
  self.eval_set: list[tuple] | None = None
277
- self.autodetected_search_keys: dict[str, SearchKey] = dict()
277
+ self.autodetected_search_keys: dict[str, SearchKey] | None = None
278
278
  self.imbalanced = False
279
279
  self.fit_select_features = True
280
280
  self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
1311
1311
  def _get_autodetected_search_keys(self):
1312
1312
  if self.autodetected_search_keys is None and self._search_task is not None:
1313
1313
  meta = self._search_task.get_file_metadata(self._get_trace_id())
1314
- self.autodetected_search_keys = {k: SearchKey[v] for k, v in meta.autodetectedSearchKeys.items()}
1314
+ autodetected_search_keys = meta.autodetectedSearchKeys or {}
1315
+ self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
1315
1316
 
1316
1317
  return self.autodetected_search_keys
1317
1318
 
1319
+ def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
1320
+ if self.autodetected_search_keys is None:
1321
+ self.autodetected_search_keys = dict()
1322
+ self.autodetected_search_keys.update(adding_search_keys)
1323
+ return self.autodetected_search_keys
1324
+
1318
1325
  def _get_fit_search_keys_with_original_names(self):
1319
1326
  if self.fit_search_keys is None and self._search_task is not None:
1320
1327
  fit_search_keys = dict()
@@ -2553,9 +2560,7 @@ if response.status_code == 200:
2553
2560
 
2554
2561
  self.__validate_search_keys(search_keys, self.search_id)
2555
2562
 
2556
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(
2557
- X, y, eval_set=None, is_transform=True
2558
- )
2563
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
2559
2564
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2560
2565
 
2561
2566
  validated_Xy = df.copy()
@@ -2956,10 +2961,6 @@ if response.status_code == 200:
2956
2961
  if add_fit_system_record_id:
2957
2962
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2958
2963
 
2959
- for c in result.columns:
2960
- if result[c].dtype == "category":
2961
- result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
2962
-
2963
2964
  return result, columns_renaming, generated_features, search_keys
2964
2965
 
2965
2966
  def _selecting_input_and_generated_columns(
@@ -3649,7 +3650,8 @@ if response.status_code == 200:
3649
3650
  keys.append("EMAIL")
3650
3651
  if "DATE" in keys:
3651
3652
  keys.append("DATETIME")
3652
- search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
3653
+ autodetected_search_keys = self.autodetected_search_keys or {}
3654
+ search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
3653
3655
  return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
3654
3656
 
3655
3657
  def _validate_train_eval(
@@ -4784,7 +4786,7 @@ if response.status_code == 200:
4784
4786
  maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
4785
4787
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
4786
4788
  date_column = next(iter(maybe_date))
4787
- if x[date_column].nunique() > 0.9 * _num_samples(x):
4789
+ if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
4788
4790
  msg = self.bundle.get("date_search_without_time_series")
4789
4791
  self.__log_warning(msg)
4790
4792
 
@@ -4884,8 +4886,9 @@ if response.status_code == 200:
4884
4886
  maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4885
4887
  if len(maybe_keys) > 0:
4886
4888
  datetime_key = maybe_keys[0]
4887
- search_keys[datetime_key] = SearchKey.DATETIME
4888
- self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
4889
+ new_keys = {datetime_key: SearchKey.DATETIME}
4890
+ search_keys.update(new_keys)
4891
+ self._add_autodetected_search_keys(new_keys)
4889
4892
  self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4890
4893
  print(self.bundle.get("datetime_detected").format(datetime_key))
4891
4894
 
@@ -4894,15 +4897,16 @@ if response.status_code == 200:
4894
4897
  if maybe_keys:
4895
4898
  new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
4896
4899
  search_keys.update(new_keys)
4897
- self.autodetected_search_keys.update(new_keys)
4900
+ self._add_autodetected_search_keys(new_keys)
4898
4901
  self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
4899
4902
  print(self.bundle.get("postal_code_detected").format(maybe_keys))
4900
4903
 
4901
4904
  if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
4902
4905
  maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
4903
4906
  if maybe_key:
4904
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
4905
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
4907
+ new_keys = {maybe_key[0]: SearchKey.COUNTRY}
4908
+ search_keys.update(new_keys)
4909
+ self._add_autodetected_search_keys(new_keys)
4906
4910
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
4907
4911
  print(self.bundle.get("country_detected").format(maybe_key))
4908
4912
 
@@ -4912,7 +4916,7 @@ if response.status_code == 200:
4912
4916
  if self.__is_registered or is_demo_dataset:
4913
4917
  new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
4914
4918
  search_keys.update(new_keys)
4915
- self.autodetected_search_keys.update(new_keys)
4919
+ self._add_autodetected_search_keys(new_keys)
4916
4920
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
4917
4921
  print(self.bundle.get("email_detected").format(maybe_keys))
4918
4922
  else:
@@ -4928,7 +4932,7 @@ if response.status_code == 200:
4928
4932
  if self.__is_registered or is_demo_dataset:
4929
4933
  new_keys = {key: SearchKey.PHONE for key in maybe_keys}
4930
4934
  search_keys.update(new_keys)
4931
- self.autodetected_search_keys.update(new_keys)
4935
+ self._add_autodetected_search_keys(new_keys)
4932
4936
  self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
4933
4937
  print(self.bundle.get("phone_detected").format(maybe_keys))
4934
4938
  else:
@@ -5025,7 +5029,9 @@ if response.status_code == 200:
5025
5029
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
5026
5030
  )
5027
5031
  else:
5028
- self.rest_client.dump_input_file(f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256)
5032
+ self.rest_client.dump_input_file(
5033
+ trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
5034
+ )
5029
5035
 
5030
5036
  if y_ is not None:
5031
5037
  if isinstance(y_, pd.Series):
upgini/search_task.py CHANGED
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
434
434
  tmp_file_name = f"{tmp_dir}/{file_name}"
435
435
  with open(tmp_file_name, "wb") as gzip_file:
436
436
  gzip_file.write(file_content)
437
- return pd.read_parquet(tmp_file_name, engine="fastparquet")
437
+ # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
438
+ return pd.read_parquet(tmp_file_name, engine="pyarrow")
@@ -4,16 +4,49 @@ from pandas.api.types import (
4
4
  is_object_dtype,
5
5
  is_string_dtype,
6
6
  )
7
+ import re
7
8
 
8
9
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
9
10
 
10
11
 
11
12
  class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
13
+ postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
14
+
12
15
  def _is_search_key_by_name(self, column_name: str) -> bool:
13
- return str(column_name).lower() in ["zip", "zipcode", "zip_code", "postal_code", "postalcode"]
16
+ return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
14
17
 
15
18
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
16
- return False
19
+ """
20
+ # Fast two-step check whether the column looks like a postal code.
21
+ # Returns True if, after removing missing values, values remain,
22
+ # and all of them match the common characteristics of a postal code.
23
+ """
24
+ # Check only columns that are candidates for postal code by column name
25
+ if not self._is_search_key_by_name(column.name):
26
+ return False
27
+
28
+ s = column.copy().dropna().astype(str).str.strip()
29
+ s = s[s != ""] # remove empty strings
30
+ if s.empty:
31
+ return False
32
+
33
+ # remove suffix ".0" (often after float)
34
+ s = s.str.replace(r"\.0$", "", regex=True)
35
+
36
+ # --- Step 1: fast filtering ---
37
+ mask_len = s.str.len().between(2, 10)
38
+ mask_digit = s.str.contains(r'\d', regex=True)
39
+ mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
40
+ fast_mask = mask_len & mask_digit & mask_chars
41
+
42
+ # if any of them failed the fast check, return False
43
+ if not fast_mask.all():
44
+ return False
45
+
46
+ # --- Step 2: regex check ---
47
+ # only if the first step passed
48
+ valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
49
+ return valid_mask.all()
17
50
 
18
51
 
19
52
  class PostalCodeSearchKeyConverter:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.142a2
3
+ Version: 1.2.144
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=dSeNLAH450Diequ6QaqtlfmtTqZqHt2ymoZ_4jsnfj0,26
1
+ upgini/__about__.py,sha256=aics4V8KhPF0GdMfEahLLcODv_MyCYM-IVrE84vOyM8,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=07decijgeSDaN6A1qVf71ZETu9nXVhNRWUTPWZeNpJo,234667
6
+ upgini/features_enricher.py,sha256=f_l0qOzclD6Zi2vEqfFbkFwuxDxFU9nFuaC88leDJGg,235034
7
7
  upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
8
8
  upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
10
- upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
10
+ upgini/search_task.py,sha256=zEXqfX95g0aLtVQUpwQjCvFqqMCAIWELAJRoeYBP5tQ,18746
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
13
13
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
64
64
  upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
65
65
  upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
66
66
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
67
- upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
67
+ upgini/utils/postal_code_utils.py,sha256=COQ23lYcLS2Ky-qI-eBOVxhEMdBl7J2cFDT9F3XdswI,3003
68
68
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
69
69
  upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
70
70
  upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
74
74
  upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.142a2.dist-info/METADATA,sha256=WCW6AUmnuZ1JnKGHmjZhdAzyJv4AVOtPleHTz7ewAfY,51166
78
- upgini-1.2.142a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
- upgini-1.2.142a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.142a2.dist-info/RECORD,,
77
+ upgini-1.2.144.dist-info/METADATA,sha256=hBvc1KOqsAiItKjYzQDkxt07e8v3GAMQuOkYoNyRrl8,51164
78
+ upgini-1.2.144.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
+ upgini-1.2.144.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.144.dist-info/RECORD,,