PyPI - upgini - Versions diffs - 1.2.121a2__py3-none-any.whl → 1.2.122a1__py3-none-any.whl - Mend

upgini 1.2.121a2-py3-none-any.whl → 1.2.122a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

upgini/__about__.py +1 -1
upgini/autofe/feature.py +10 -0
upgini/features_enricher.py +31 -23
upgini/resource_bundle/strings.properties +1 -1
upgini/utils/features_validator.py +18 -7
upgini/utils/psi.py +0 -1
{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/METADATA +1 -1
{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/RECORD +10 -10
{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/WHEEL +0 -0
{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.121a2"
1	+ __version__ = "1.2.122a1"

upgini/autofe/feature.py CHANGED Viewed

@@ -42,6 +42,9 @@ class Column:
     def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
         return self.get_columns(**kwargs)[0]
+    def reset_display_indices(self) -> "Column":
+        return self
     def _unhash(self, feature_name: str) -> str:
         last_component_idx = feature_name.rfind("_")
         if not feature_name.startswith("f_"):
@@ -212,6 +215,13 @@ class Feature:
         self.cached_display_name = None
         return self
+    def reset_display_indices(self) -> "Feature":
+        for child in self.children:
+            child.reset_display_indices()
+        self.display_index = None
+        self.cached_display_name = None
+        return self
     def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
         if self.op.output_type:
             return self.op.output_type

upgini/features_enricher.py CHANGED Viewed

@@ -1028,7 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
                     columns_renaming,
                     _,
                 ) = prepared_data
                 gc.collect()
                 if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1406,7 +1406,7 @@ class FeaturesEnricher(TransformerMixin):
         self,
         X: pd.DataFrame,
         eval_set: list[tuple[pd.DataFrame, pd.Series]],
-        enriched_eval_set: dict,
+        enriched_eval_set: dict[int, tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]],
         eval_set_dates: dict[int, pd.Series],
         search_keys: dict[str, SearchKey],
         stability_threshold: float,
@@ -1417,31 +1417,42 @@ class FeaturesEnricher(TransformerMixin):
         # Find latest eval set or earliest if all eval sets are before train set
         date_column = self._get_date_column(search_keys)
+        date_converter = DateTimeSearchKeyConverter(
+            date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
+        )
+        X = date_converter.convert(X)
         x_date = X[date_column].dropna()
-        if not is_numeric_dtype(x_date):
-            x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
-        main_min_date = x_date.min()
+        if len(x_date) == 0:
+            self.logger.warning("Empty date column in X")
+            return []
-        for eval_x, _ in eval_set:
-            eval_x_date = eval_x[date_column].dropna()
-            if not is_numeric_dtype(eval_x_date):
-                eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
+        main_min_date = x_date.min()
         # Find minimum date for each eval_set and compare with main dataset
         eval_dates = []
         for i, (eval_x, _) in enumerate(eval_set):
-            if date_column in eval_x.columns:
-                if len(eval_x) < 1000:
-                    self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
-                    continue
-                eval_x_date = eval_x[date_column].dropna()
-                if not is_numeric_dtype(eval_x_date):
-                    eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
-                eval_min_date = eval_x_date.min()
-                eval_max_date = eval_x_date.max()
-                eval_dates.append((i, eval_min_date, eval_max_date))
+            if date_column not in eval_x.columns:
+                self.logger.warning(f"Date column not found in eval_set {i + 1}")
+                continue
+            eval_x = date_converter.convert(eval_x)
+            eval_x_date = eval_x[date_column].dropna()
+            if len(eval_x_date) < 1000:
+                self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
+                continue
+            if len(enriched_eval_set[i][2]) < 1000:
+                self.logger.warning(
+                    f"Enriched eval_set {i} has less than 1000 rows. It will be ignored for stability check"
+                )
+                continue
+            eval_min_date = eval_x_date.min()
+            eval_max_date = eval_x_date.max()
+            eval_dates.append((i, eval_min_date, eval_max_date))
         if not eval_dates:
+            self.logger.warning("There are no correct eval_sets for stability check")
             return []
         # Check if any eval_set has minimum date >= main dataset minimum date
@@ -1464,10 +1475,7 @@ class FeaturesEnricher(TransformerMixin):
         checking_eval_set_df = checking_eval_set_df.copy()
         checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-        if not is_numeric_dtype(checking_eval_set_df[date_column]):
-            checking_eval_set_df[date_column] = (
-                pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
-            )
+        checking_eval_set_df = date_converter.convert(checking_eval_set_df)
         psi_values_sparse = calculate_sparsity_psi(
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
     # features validation
 empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
 high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
-one_hot_encoded_features=One hot encoded features detected: {}
+one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
     # Dataset validation
 dataset_too_few_rows=X size should be at least {} rows after validation

upgini/utils/features_validator.py CHANGED Viewed

@@ -46,7 +46,7 @@ class FeaturesValidator:
         if one_hot_encoded_features:
             msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
-            self.logger.info(msg)
+            warnings.append(msg)
         columns_renaming = columns_renaming or {}
@@ -100,18 +100,29 @@ class FeaturesValidator:
     @staticmethod
     def is_one_hot_encoded(series: pd.Series) -> bool:
         try:
-            # Column contains only 0 and 1 (as strings or numbers)
-            series = series.astype(float)
-            if set(series.unique()) != {0.0, 1.0}:
+            # All rows should be the same type
+            if series.apply(lambda x: type(x)).nunique() != 1:
+                return False
+            # First, handle string representations of True/False
+            series_copy = series.copy()
+            if series_copy.dtype == "object" or series_copy.dtype == "string":
+                # Convert string representations of boolean values to numeric
+                series_copy = series_copy.astype(str).str.strip().str.lower()
+                series_copy = series_copy.replace({"true": "1", "false": "0"})
+            # Column contains only 0 and 1 (as strings or numbers or booleans)
+            series_copy = series_copy.astype(float)
+            if set(series_copy.unique()) != {0.0, 1.0}:
                 return False
-            series = series.astype(int)
+            series_copy = series_copy.astype(int)
             # Column doesn't contain any NaN, np.NaN, space, null, etc.
-            if not (series.isin([0, 1])).all():
+            if not (series_copy.isin([0, 1])).all():
                 return False
-            vc = series.value_counts()
+            vc = series_copy.value_counts()
             # Column should contain both 0 and 1
             if len(vc) != 2:
                 return False

upgini/utils/psi.py CHANGED Viewed

@@ -7,7 +7,6 @@ from typing import Callable, Dict, Optional
 import more_itertools
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_numeric_dtype
 from pydantic import BaseModel
 from upgini.metadata import TARGET, ModelTaskType

{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.121a2
+Version: 1.2.122a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=Dv8DzHbPAHs_fY_MACW4HNqnYW7CilejShdVPFkTaYM,26
+upgini/__about__.py,sha256=hzzmPAt8OZIX5YRwSKl5dj9LWowWDEnOpFN5Xq2xARQ,26
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
+upgini/features_enricher.py,sha256=lBaecwDHkKpYWTz8fxs5Q12bDJGPLcDOesCPh0xX96s,231839
 upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
 upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -16,7 +16,7 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
 upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
-upgini/autofe/feature.py,sha256=b4Ps_sCPui9b4h0K3ya85cfL1SWpLVrlHc40zkKVfAY,16329
+upgini/autofe/feature.py,sha256=2jOdTTnUqdUewznxsveuTLgKcPLPNtFWS0YQsYYBbPk,16622
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=RB3rKMjFi5Cx81RiYXN3OTCuXjmvzmFKQrxn4h0Oclo,5219
 upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
+upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
 upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
-upgini/utils/features_validator.py,sha256=RAnfX80GBFcz6-SlTSR0DF6BZzf7A7IL8dlIqEoSz_s,4265
+upgini/utils/features_validator.py,sha256=A_3AX7X5u5AH7RLgkTiS6dHxaOiq5vm8w4ijQWLGcMY,4871
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -66,7 +66,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
+upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
 upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
 upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.121a2.dist-info/METADATA,sha256=1XVh2jWKC2I3ElN4ftyEveTny9C1pU5z69Osnp6q7_s,50745
-upgini-1.2.121a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.121a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.121a2.dist-info/RECORD,,
+upgini-1.2.122a1.dist-info/METADATA,sha256=3pPdEVaYucgJB5Klks339i5-JTM7hJpEZUmZS7dEWi8,50745
+upgini-1.2.122a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.122a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.122a1.dist-info/RECORD,,

{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.121a2__py3-none-any.whl → 1.2.122a1__py3-none-any.whl

upgini 1.2.121a2-py3-none-any.whl → 1.2.122a1-py3-none-any.whl