upgini 1.1.244a7__py3-none-any.whl → 1.1.244a8__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +3 -0
- upgini/metrics.py +68 -15
- upgini/utils/sklearn_ext.py +18 -15
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/METADATA +1 -1
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/RECORD +9 -9
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/WHEEL +0 -0
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/top_level.txt +0 -0
upgini/dataset.py
CHANGED
@@ -61,7 +61,7 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_THRESHOLD = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
-    MIN_SAMPLE_THRESHOLD =
+    MIN_SAMPLE_THRESHOLD = 5_000
     IMBALANCE_THESHOLD = 0.4
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
upgini/features_enricher.py
CHANGED
@@ -955,6 +955,7 @@ class FeaturesEnricher(TransformerMixin):
                 fitting_enriched_X,
                 scoring,
                 groups=groups,
+                text_features=self.generate_features,
             )
             metric = wrapper.metric_name
             multiplier = wrapper.multiplier
@@ -980,6 +981,7 @@ class FeaturesEnricher(TransformerMixin):
                     cat_features,
                     add_params=custom_loss_add_params,
                     groups=groups,
+                    text_features=self.generate_features,
                 )
                 etalon_metric = baseline_estimator.cross_val_predict(
                     fitting_X, y_sorted, self.baseline_score_column
@@ -1004,6 +1006,7 @@ class FeaturesEnricher(TransformerMixin):
                     cat_features,
                     add_params=custom_loss_add_params,
                     groups=groups,
+                    text_features=self.generate_features,
                 )
                 enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
                 self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
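For context, all three call sites above now forward generate_features into the metric estimator wrappers as text_features. A minimal usage sketch with hypothetical column names (it assumes the public FeaturesEnricher(search_keys=..., generate_features=...) constructor; fit() needs real data and an Upgini API key, so those calls are left commented out):

import pandas as pd
from upgini import FeaturesEnricher, SearchKey

# Hypothetical frame: "description" is a free-text column we want treated as text.
train = pd.DataFrame(
    {
        "phone": ["+14155550101", "+14155550102", "+14155550103"],
        "description": ["red running shoe", "blue leather boot", "green wool hat"],
        "target": [0, 1, 0],
    }
)

enricher = FeaturesEnricher(
    search_keys={"phone": SearchKey.PHONE},
    generate_features=["description"],  # text columns; with this release also passed as text_features for metrics
)

# Requires an Upgini API key and network access:
# enricher.fit(train.drop(columns="target"), train["target"])
# enricher.calculate_metrics()  # builds the wrappers below with text_features=["description"]

With this change, columns listed in generate_features are excluded from the categorical handling during metric calculation and handed to the CatBoost wrapper as text features instead.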
upgini/metrics.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import re
 from copy import deepcopy
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -201,6 +202,7 @@ class EstimatorWrapper:
         target_type: ModelTaskType,
         add_params: Optional[Dict[str, Any]] = None,
         groups: Optional[np.ndarray] = None,
+        text_features: Optional[List[str]] = None,
     ):
         self.estimator = estimator
         self.scorer = scorer
@@ -213,6 +215,7 @@ class EstimatorWrapper:
         self.add_params = add_params
         self.cv_estimators = None
         self.groups = groups
+        self.text_features = text_features
 
     def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
         X, y, _, fit_params = self._prepare_to_fit(X, y)
@@ -285,6 +288,7 @@ class EstimatorWrapper:
             groups=groups,
             fit_params=fit_params,
             return_estimator=True,
+            error_score="raise",
         )
         metrics_by_fold = cv_results["test_score"]
         self.cv_estimators = cv_results["estimator"]
@@ -330,6 +334,7 @@ class EstimatorWrapper:
             "cv": cv,
             "target_type": target_type,
             "groups": groups,
+            "text_features": text_features,
         }
         if estimator is None:
             params = dict()
@@ -391,27 +396,56 @@ class CatBoostWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(CatBoostWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
         )
         self.cat_features = None
         self.cat_features_idx = None
+        self.emb_groups = None
 
     def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         X, y, groups, params = super()._prepare_to_fit(X, y)
+
+        # Find embeddings
+        emb_pattern = r"(.+)_emb\d+"
+        emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
+        embedding_features = []
+        if len(emb_features) > 0:
+            # group by source feature
+            self.emb_groups = dict()
+            for emb in emb_features:
+                source_feature = re.match(emb_pattern, emb).group(1)
+                embs = self.emb_groups.get(source_feature, [])
+                embs.append(emb)
+                self.emb_groups[source_feature] = embs
+            self.emb_groups = {
+                source_feature: embs for source_feature, embs in self.emb_groups.items() if len(embs) > 1
+            }
+            X, embedding_features = self.group_embeddings(X)
+            params["embedding_features"] = embedding_features
+
+        # Find text features from passed in generate_features
+        if self.text_features is not None:
+            self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
+            params["text_features"] = self.text_features
+
+        # Find rest categorical features
         self.cat_features = _get_cat_features(X)
+        if self.text_features is not None:
+            self.cat_features = [
+                f for f in self.cat_features if f not in self.text_features and f not in embedding_features
+            ]
         X = fill_na_cat_features(X, self.cat_features)
-
-
-
-
-
-
-
-
-        # cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
-        self.cat_features_idx = [X.columns.get_loc(c) for c in self.cat_features]
+        unique_cat_features = []
+        for name in self.cat_features:
+            # Remove constant categorical features
+            if X[name].nunique() > 1:
+                unique_cat_features.append(name)
+            else:
+                X = X.drop(columns=name)
+        self.cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
         if (
             hasattr(self.estimator, "get_param")
             and hasattr(self.estimator, "_init_params")
@@ -422,15 +456,32 @@ class CatBoostWrapper(EstimatorWrapper):
             self.cat_features_idx = list(cat_features_set)
             del self.estimator._init_params["cat_features"]
 
-        params
+        params["cat_features"] = self.cat_features_idx
+
         return X, y, groups, params
 
+    def group_embeddings(self, df: pd.DataFrame):
+        emb_columns = []
+        for source_feature, embs in self.emb_groups.items():
+            emb_name = f"{source_feature}_emb"
+            df[embs] = df[embs].fillna(0.0)
+            df[emb_name] = df[embs].values.tolist()
+            df = df.drop(columns=embs)
+            emb_columns.append(emb_name)
+        return df, emb_columns
+
     def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         X, y, params = super()._prepare_to_calculate(X, y)
+        if self.text_features is not None:
+            params["text_features"] = self.text_features
+        if self.emb_groups is not None:
+            X, emb_columns = self.group_embeddings(X)
+            params["embedding_features"] = emb_columns
         if self.cat_features is not None:
            X = fill_na_cat_features(X, self.cat_features)
         if self.cat_features_idx is not None:
-            params
+            params["cat_features"] = self.cat_features_idx
+
         return X, y, params
 
 
@@ -444,9 +495,10 @@ class LightGBMWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(LightGBMWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
        )
         self.cat_features = None
 
@@ -482,9 +534,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(OtherEstimatorWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
         )
         self.cat_features = None
 
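To make the new CatBoostWrapper preparation concrete, here is a standalone sketch with toy column names (not package code) that mirrors the grouping and parameter logic added above; the resulting keys correspond to the cat_features / text_features / embedding_features arguments CatBoost's fit accepts:

import re
import pandas as pd
from pandas.api.types import is_numeric_dtype

# Toy frame: two embedding columns generated from "name", one text column, one categorical.
X = pd.DataFrame(
    {
        "name_emb0": [0.1, 0.2, None],
        "name_emb1": [0.3, 0.4, 0.5],
        "description": ["red shoe", "blue shoe", "green hat"],
        "country": ["US", "DE", "US"],
    }
)

emb_pattern = r"(.+)_emb\d+"
emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]

# Group <source>_embN columns by their source feature (same pattern as _prepare_to_fit).
emb_groups = {}
for emb in emb_features:
    source = re.match(emb_pattern, emb).group(1)
    emb_groups.setdefault(source, []).append(emb)
emb_groups = {src: cols for src, cols in emb_groups.items() if len(cols) > 1}

# Collapse each group into one list-valued column, as group_embeddings does.
embedding_features = []
for source, cols in emb_groups.items():
    X[cols] = X[cols].fillna(0.0)
    X[f"{source}_emb"] = X[cols].values.tolist()
    X = X.drop(columns=cols)
    embedding_features.append(f"{source}_emb")

# These keys end up in the fit params the wrapper hands to CatBoost.
params = {
    "embedding_features": embedding_features,  # ["name_emb"]; each row holds [emb0, emb1]
    "text_features": ["description"],          # non-numeric columns from generate_features
    "cat_features": ["country"],               # remaining categoricals (the wrapper uses column indices)
}
print(X)
print(params)

In short, the wrapper now collapses <source>_emb0 … <source>_embN columns into one list-valued embedding column per source feature, routes generate_features columns to CatBoost as text features, and drops constant categorical columns before computing categorical indices.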
upgini/utils/sklearn_ext.py
CHANGED
@@ -21,6 +21,7 @@ from sklearn.metrics._scorer import _MultimetricScorer
 from sklearn.model_selection import check_cv
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.utils.validation import indexable
+from sklearn.model_selection import cross_validate as original_cross_validate
 
 _DEFAULT_TAGS = {
     "non_deterministic": False,
@@ -313,21 +314,23 @@ def cross_validate(
         return ret
     except Exception:
         logging.exception("Failed to execute overriden cross_validate. Fallback to original")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        raise
+        # fit_params["use_best_model"] = False
+        # return original_cross_validate(
+        #     estimator,
+        #     X,
+        #     y,
+        #     groups=groups,
+        #     scoring=scoring,
+        #     cv=cv,
+        #     n_jobs=n_jobs,
+        #     verbose=verbose,
+        #     fit_params=fit_params,
+        #     pre_dispatch=pre_dispatch,
+        #     return_train_score=return_train_score,
+        #     return_estimator=return_estimator,
+        #     error_score=error_score,
+        # )
 
 
 def _fit_and_score(
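Together with the error_score="raise" addition in metrics.py, the overridden cross_validate no longer falls back silently: a failing fold now surfaces as an exception instead of being hidden behind NaN scores and a warning. A small illustration with the standard scikit-learn API (toy data, not upgini code):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

# Deliberately misconfigured estimator: fit() raises for an unknown solver.
bad_estimator = LogisticRegression(solver="no-such-solver")

# With error_score="raise" (what metrics.py now passes), the per-fold error
# propagates immediately rather than being recorded as a NaN score.
try:
    cross_validate(bad_estimator, X, y, cv=5, error_score="raise")
except ValueError as exc:
    print(f"cross-validation failed: {exc}")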
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=2oOmBe8_mpwJ8Fw14gw4uZ1GgLU4PtjozkXhvIXhRq0,50022
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=tQI3qhwMqBfmPD3pygmT6Jrg6SiuLoc7FIXMUQRj1W4,165007
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
 upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=6KglRDnHOotP5HttlkPj2oQMM0MDjY_QtUMrczpl3gQ,26065
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -50,12 +50,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
 upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
-upgini/utils/sklearn_ext.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=fbRQ2ggX2Ock61RJZ-QqvMasqy8-x71knjQrj19GTMM,44025
 upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
+upgini-1.1.244a8.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.244a8.dist-info/METADATA,sha256=2y08SoG74Ck2fqeRZy8OazKjnGNd5_S1G4I5JAYEY5M,48264
+upgini-1.1.244a8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+upgini-1.1.244a8.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.244a8.dist-info/RECORD,,
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/LICENSE
File without changes
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/WHEEL
File without changes
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/top_level.txt
File without changes