PyPI - upgini - Versions diffs - 1.2.142a2__tar.gz → 1.2.144__tar.gz - Mend

upgini 1.2.142a2.tar.gz → 1.2.144.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (82)

{upgini-1.2.142a2 → upgini-1.2.144}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: upgini
-Version: 1.2.142a2
+Version: 1.2.144
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.144/src/upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.2.144"

{upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/features_enricher.py RENAMED Viewed

@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
         self.X: pd.DataFrame | None = None
         self.y: pd.Series | None = None
         self.eval_set: list[tuple] | None = None
-        self.autodetected_search_keys: dict[str, SearchKey] = dict()
+        self.autodetected_search_keys: dict[str, SearchKey] | None = None
         self.imbalanced = False
         self.fit_select_features = True
         self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
     def _get_autodetected_search_keys(self):
         if self.autodetected_search_keys is None and self._search_task is not None:
             meta = self._search_task.get_file_metadata(self._get_trace_id())
-            self.autodetected_search_keys = {k: SearchKey[v] for k, v in meta.autodetectedSearchKeys.items()}
+            autodetected_search_keys = meta.autodetectedSearchKeys or {}
+            self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
         return self.autodetected_search_keys
+    def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
+        if self.autodetected_search_keys is None:
+            self.autodetected_search_keys = dict()
+        self.autodetected_search_keys.update(adding_search_keys)
+        return self.autodetected_search_keys
     def _get_fit_search_keys_with_original_names(self):
         if self.fit_search_keys is None and self._search_task is not None:
             fit_search_keys = dict()
@@ -2553,9 +2560,7 @@ if response.status_code == 200:
         self.__validate_search_keys(search_keys, self.search_id)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
-        )
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
         validated_Xy = df.copy()
@@ -2956,10 +2961,6 @@ if response.status_code == 200:
         if add_fit_system_record_id:
             result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
-        for c in result.columns:
-            if result[c].dtype == "category":
-                result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
         return result, columns_renaming, generated_features, search_keys
     def _selecting_input_and_generated_columns(
@@ -3649,7 +3650,8 @@ if response.status_code == 200:
             keys.append("EMAIL")
         if "DATE" in keys:
             keys.append("DATETIME")
-        search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
+        autodetected_search_keys = self.autodetected_search_keys or {}
+        search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
     def _validate_train_eval(
@@ -4784,7 +4786,7 @@ if response.status_code == 200:
         maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
         if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
             date_column = next(iter(maybe_date))
-            if x[date_column].nunique() > 0.9 * _num_samples(x):
+            if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
                 msg = self.bundle.get("date_search_without_time_series")
                 self.__log_warning(msg)
@@ -4884,8 +4886,9 @@ if response.status_code == 200:
             maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
             if len(maybe_keys) > 0:
                 datetime_key = maybe_keys[0]
-                search_keys[datetime_key] = SearchKey.DATETIME
-                self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
+                new_keys = {datetime_key: SearchKey.DATETIME}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
                 print(self.bundle.get("datetime_detected").format(datetime_key))
@@ -4894,15 +4897,16 @@ if response.status_code == 200:
         if maybe_keys:
             new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
             search_keys.update(new_keys)
-            self.autodetected_search_keys.update(new_keys)
+            self._add_autodetected_search_keys(new_keys)
             self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
             print(self.bundle.get("postal_code_detected").format(maybe_keys))
         if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
             maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
             if maybe_key:
-                search_keys[maybe_key[0]] = SearchKey.COUNTRY
-                self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
+                new_keys = {maybe_key[0]: SearchKey.COUNTRY}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
                 print(self.bundle.get("country_detected").format(maybe_key))
@@ -4912,7 +4916,7 @@ if response.status_code == 200:
                 if self.__is_registered or is_demo_dataset:
                     new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
                     search_keys.update(new_keys)
-                    self.autodetected_search_keys.update(new_keys)
+                    self._add_autodetected_search_keys(new_keys)
                     self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
                     print(self.bundle.get("email_detected").format(maybe_keys))
                 else:
@@ -4928,7 +4932,7 @@ if response.status_code == 200:
             if self.__is_registered or is_demo_dataset:
                 new_keys = {key: SearchKey.PHONE for key in maybe_keys}
                 search_keys.update(new_keys)
-                self.autodetected_search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
                 print(self.bundle.get("phone_detected").format(maybe_keys))
             else:
@@ -5025,7 +5029,9 @@ if response.status_code == 200:
                                 f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
                             )
                         else:
-                            self.rest_client.dump_input_file(f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256)
+                            self.rest_client.dump_input_file(
+                                trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
+                            )
                         if y_ is not None:
                             if isinstance(y_, pd.Series):

{upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/search_task.py RENAMED Viewed

@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
         tmp_file_name = f"{tmp_dir}/{file_name}"
         with open(tmp_file_name, "wb") as gzip_file:
             gzip_file.write(file_content)
-        return pd.read_parquet(tmp_file_name, engine="fastparquet")
+        # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
+        return pd.read_parquet(tmp_file_name, engine="pyarrow")

{upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/postal_code_utils.py RENAMED Viewed

@@ -4,16 +4,49 @@ from pandas.api.types import (
     is_object_dtype,
     is_string_dtype,
 )
+import re
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
 class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
+    postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
     def _is_search_key_by_name(self, column_name: str) -> bool:
-        return str(column_name).lower() in ["zip", "zipcode", "zip_code", "postal_code", "postalcode"]
+        return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
-        return False
+        """
+        # Fast two-step check whether the column looks like a postal code.
+        # Returns True if, after removing missing values, values remain,
+        # and all of them match the common characteristics of a postal code.
+        """
+        # Check only columns that are candidates for postal code by column name
+        if not self._is_search_key_by_name(column.name):
+            return False
+        s = column.copy().dropna().astype(str).str.strip()
+        s = s[s != ""]  # remove empty strings
+        if s.empty:
+            return False
+        # remove suffix ".0" (often after float)
+        s = s.str.replace(r"\.0$", "", regex=True)
+        # --- Step 1: fast filtering ---
+        mask_len = s.str.len().between(2, 10)
+        mask_digit = s.str.contains(r'\d', regex=True)
+        mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
+        fast_mask = mask_len & mask_digit & mask_chars
+        # if any of them failed the fast check, return False
+        if not fast_mask.all():
+            return False
+        # --- Step 2: regex check ---
+        # only if the first step passed
+        valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
+        return valid_mask.all()
 class PostalCodeSearchKeyConverter: