PyPI - upgini - Versions diffs - 1.2.143__py3-none-any.whl → 1.2.145__py3-none-any.whl - Mend

upgini 1.2.143__py3-none-any.whl → 1.2.145__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (8)

upgini/__about__.py +1 -1
upgini/features_enricher.py +26 -17
upgini/search_task.py +2 -1
upgini/utils/postal_code_utils.py +4 -0
{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/METADATA +1 -1
{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/RECORD +8 -8
{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/WHEEL +0 -0
{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.143"
1	+ __version__ = "1.2.145"

upgini/features_enricher.py CHANGED Viewed

@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
         self.X: pd.DataFrame | None = None
         self.y: pd.Series | None = None
         self.eval_set: list[tuple] | None = None
-        self.autodetected_search_keys: dict[str, SearchKey] = dict()
+        self.autodetected_search_keys: dict[str, SearchKey] | None = None
         self.imbalanced = False
         self.fit_select_features = True
         self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
     def _get_autodetected_search_keys(self):
         if self.autodetected_search_keys is None and self._search_task is not None:
             meta = self._search_task.get_file_metadata(self._get_trace_id())
-            self.autodetected_search_keys = {k: SearchKey[v] for k, v in meta.autodetectedSearchKeys.items()}
+            autodetected_search_keys = meta.autodetectedSearchKeys or {}
+            self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
         return self.autodetected_search_keys
+    def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
+        if self.autodetected_search_keys is None:
+            self.autodetected_search_keys = dict()
+        self.autodetected_search_keys.update(adding_search_keys)
+        return self.autodetected_search_keys
     def _get_fit_search_keys_with_original_names(self):
         if self.fit_search_keys is None and self._search_task is not None:
             fit_search_keys = dict()
@@ -2954,10 +2961,6 @@ if response.status_code == 200:
         if add_fit_system_record_id:
             result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
-        for c in result.columns:
-            if result[c].dtype == "category":
-                result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
         return result, columns_renaming, generated_features, search_keys
     def _selecting_input_and_generated_columns(
@@ -2997,15 +3000,16 @@ if response.status_code == 200:
         return selected_input_columns + selected_generated_features
-    def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
+    def _validate_empty_search_keys(self, search_keys: dict[str, SearchKey], is_transform: bool = False):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
-            if search_id:
-                self.logger.debug(f"search_id {search_id} provided without search_keys")
+            if is_transform:
+                self.logger.debug("Transform started without search_keys")
                 return
             else:
                 self.logger.warning("search_keys not provided")
                 raise ValidationError(self.bundle.get("empty_search_keys"))
+    def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
         key_types = search_keys.values()
         # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
@@ -3647,7 +3651,8 @@ if response.status_code == 200:
             keys.append("EMAIL")
         if "DATE" in keys:
             keys.append("DATETIME")
-        search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
+        autodetected_search_keys = self.autodetected_search_keys or {}
+        search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
     def _validate_train_eval(
@@ -4797,6 +4802,8 @@ if response.status_code == 200:
         self.logger.info(f"Prepared search keys: {valid_search_keys}")
+        self._validate_empty_search_keys(valid_search_keys, is_transform=is_transform)
         return valid_search_keys
     def __show_metrics(
@@ -4882,8 +4889,9 @@ if response.status_code == 200:
             maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
             if len(maybe_keys) > 0:
                 datetime_key = maybe_keys[0]
-                search_keys[datetime_key] = SearchKey.DATETIME
-                self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
+                new_keys = {datetime_key: SearchKey.DATETIME}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
                 print(self.bundle.get("datetime_detected").format(datetime_key))
@@ -4892,15 +4900,16 @@ if response.status_code == 200:
         if maybe_keys:
             new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
             search_keys.update(new_keys)
-            self.autodetected_search_keys.update(new_keys)
+            self._add_autodetected_search_keys(new_keys)
             self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
             print(self.bundle.get("postal_code_detected").format(maybe_keys))
         if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
             maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
             if maybe_key:
-                search_keys[maybe_key[0]] = SearchKey.COUNTRY
-                self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
+                new_keys = {maybe_key[0]: SearchKey.COUNTRY}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
                 print(self.bundle.get("country_detected").format(maybe_key))
@@ -4910,7 +4919,7 @@ if response.status_code == 200:
                 if self.__is_registered or is_demo_dataset:
                     new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
                     search_keys.update(new_keys)
-                    self.autodetected_search_keys.update(new_keys)
+                    self._add_autodetected_search_keys(new_keys)
                     self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
                     print(self.bundle.get("email_detected").format(maybe_keys))
                 else:
@@ -4926,7 +4935,7 @@ if response.status_code == 200:
             if self.__is_registered or is_demo_dataset:
                 new_keys = {key: SearchKey.PHONE for key in maybe_keys}
                 search_keys.update(new_keys)
-                self.autodetected_search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                 self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
                 print(self.bundle.get("phone_detected").format(maybe_keys))
             else:

upgini/search_task.py CHANGED Viewed

@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
         tmp_file_name = f"{tmp_dir}/{file_name}"
         with open(tmp_file_name, "wb") as gzip_file:
             gzip_file.write(file_content)
-        return pd.read_parquet(tmp_file_name, engine="fastparquet")
+        # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
+        return pd.read_parquet(tmp_file_name, engine="pyarrow")

upgini/utils/postal_code_utils.py CHANGED Viewed

@@ -21,6 +21,10 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
         # Returns True if, after removing missing values, values remain,
         # and all of them match the common characteristics of a postal code.
         """
+        # Check only columns that are candidates for postal code by column name
+        if not self._is_search_key_by_name(column.name):
+            return False
         s = column.copy().dropna().astype(str).str.strip()
         s = s[s != ""]  # remove empty strings
         if s.empty:

{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: upgini
-Version: 1.2.143
+Version: 1.2.145
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=TkxQ31o1wT_libI5BqRH2rhpiwyfbBiFlzZY9cCcBfw,24
+upgini/__about__.py,sha256=4jbqu35vw7uX_qx6RMvIXk8QXoruhLaE4ANuc3Ev6ks,24
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=bkUpUC1sdhCQcLPysB7BC8WiFDPfjiJj1SztExpv0nA,33735
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=KTSNUaJ_j6fBiCFnFfqY92kfSqGMXdgoIO2L7dyYM68,234739
+upgini/features_enricher.py,sha256=7vAiAQNGH5BJENp22NfgFRJfsBNSYyFlIHJmPmA5F-I,235217
 upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
 upgini/metadata.py,sha256=iYlL91g2PMHjiPIySIZb4IzIIUsPaAMIiV2It95GAjA,12866
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
-upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
+upgini/search_task.py,sha256=zEXqfX95g0aLtVQUpwQjCvFqqMCAIWELAJRoeYBP5tQ,18746
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -64,7 +64,7 @@ upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,55
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
 upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
-upgini/utils/postal_code_utils.py,sha256=sxdk32CTLvey6zqBbay_HtNzLqcn2eAYUJL-sNmJuDg,2840
+upgini/utils/postal_code_utils.py,sha256=COQ23lYcLS2Ky-qI-eBOVxhEMdBl7J2cFDT9F3XdswI,3003
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
 upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
 upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.143.dist-info/METADATA,sha256=rabfdtvLV7EsidthqGBubCvVp8zWDjqDf7_Nur_1Dro,51164
-upgini-1.2.143.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-upgini-1.2.143.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.143.dist-info/RECORD,,
+upgini-1.2.145.dist-info/METADATA,sha256=p_rXUVOW9xW1zDnHfPkzMOUvs3Jumd7teykmLYWyEIM,51164
+upgini-1.2.145.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+upgini-1.2.145.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.145.dist-info/RECORD,,

{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.143.dist-info → upgini-1.2.145.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.143__py3-none-any.whl → 1.2.145__py3-none-any.whl

Potentially problematic release.

upgini 1.2.143__py3-none-any.whl → 1.2.145__py3-none-any.whl