upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +20 -9
- upgini/metrics.py +32 -5
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/RECORD +8 -8
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/licenses/LICENSE +0 -0
    
        upgini/__about__.py
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            __version__ = "1.2.71a3832.dev11"
         | 
| 1 | 
            +
            __version__ = "1.2.71a3832.dev13"
         | 
    
        upgini/features_enricher.py
    CHANGED
    
    | @@ -12,6 +12,7 @@ import tempfile | |
| 12 12 | 
             
            import time
         | 
| 13 13 | 
             
            import uuid
         | 
| 14 14 | 
             
            from collections import Counter
         | 
| 15 | 
            +
            from copy import deepcopy
         | 
| 15 16 | 
             
            from dataclasses import dataclass
         | 
| 16 17 | 
             
            from threading import Thread
         | 
| 17 18 | 
             
            from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
         | 
| @@ -3812,6 +3813,7 @@ if response.status_code == 200: | |
| 3812 3813 | 
             
                    features_meta = self._search_task.get_all_features_metadata_v2()
         | 
| 3813 3814 | 
             
                    if features_meta is None:
         | 
| 3814 3815 | 
             
                        raise Exception(self.bundle.get("missing_features_meta"))
         | 
| 3816 | 
            +
                    features_meta = deepcopy(features_meta)
         | 
| 3815 3817 |  | 
| 3816 3818 | 
             
                    original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         | 
| 3817 3819 | 
             
                    df = df.rename(columns=original_names_dict)
         | 
| @@ -3854,6 +3856,7 @@ if response.status_code == 200: | |
| 3854 3856 | 
             
                    features_meta = self._search_task.get_all_features_metadata_v2()
         | 
| 3855 3857 | 
             
                    if features_meta is None:
         | 
| 3856 3858 | 
             
                        raise Exception(self.bundle.get("missing_features_meta"))
         | 
| 3859 | 
            +
                    features_meta = deepcopy(features_meta)
         | 
| 3857 3860 |  | 
| 3858 3861 | 
             
                    original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         | 
| 3859 3862 | 
             
                    features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
         | 
| @@ -3869,15 +3872,23 @@ if response.status_code == 200: | |
| 3869 3872 |  | 
| 3870 3873 | 
             
                    original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
         | 
| 3871 3874 |  | 
| 3872 | 
            -
                    if updated_shaps is not None:
         | 
| 3873 | 
            -
                        for fm in features_meta:
         | 
| 3874 | 
            -
                            fm.shap_value = updated_shaps.get(fm.name, 0.0)
         | 
| 3875 | 
            -
             | 
| 3876 | 
            -
                    features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         | 
| 3877 3875 | 
             
                    for feature_meta in features_meta:
         | 
| 3878 3876 | 
             
                        if feature_meta.name in original_names_dict.keys():
         | 
| 3879 3877 | 
             
                            feature_meta.name = original_names_dict[feature_meta.name]
         | 
| 3880 3878 |  | 
| 3879 | 
            +
                        if updated_shaps is not None:
         | 
| 3880 | 
            +
                            updating_shap = updated_shaps.get(feature_meta.name)
         | 
| 3881 | 
            +
                            if updating_shap is None:
         | 
| 3882 | 
            +
                                self.logger.warning(
         | 
| 3883 | 
            +
                                    f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
         | 
| 3884 | 
            +
                                )
         | 
| 3885 | 
            +
                                updating_shap = 0.0
         | 
| 3886 | 
            +
                            feature_meta.shap_value = updating_shap
         | 
| 3887 | 
            +
             | 
| 3888 | 
            +
                    features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         | 
| 3889 | 
            +
             | 
| 3890 | 
            +
                    for feature_meta in features_meta:
         | 
| 3891 | 
            +
             | 
| 3881 3892 | 
             
                        is_client_feature = feature_meta.name in df.columns
         | 
| 3882 3893 |  | 
| 3883 3894 | 
             
                        # TODO make a decision about selected features based on special flag from mlb
         | 
| @@ -3889,7 +3900,7 @@ if response.status_code == 200: | |
| 3889 3900 | 
             
                        # Use only important features
         | 
| 3890 3901 | 
             
                        if (
         | 
| 3891 3902 | 
             
                            # feature_meta.name in self.fit_generated_features or
         | 
| 3892 | 
            -
                            feature_meta.name == COUNTRY
         | 
| 3903 | 
            +
                            feature_meta.name == COUNTRY  # constant synthetic column
         | 
| 3893 3904 | 
             
                            # In select_features mode we select also from etalon features and need to show them
         | 
| 3894 3905 | 
             
                            or (not self.fit_select_features and is_client_feature)
         | 
| 3895 3906 | 
             
                        ):
         | 
| @@ -4039,11 +4050,11 @@ if response.status_code == 200: | |
| 4039 4050 | 
             
                    if len(filtered_importances) == 0:
         | 
| 4040 4051 | 
             
                        return []
         | 
| 4041 4052 |  | 
| 4042 | 
            -
                    filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
         | 
| 4043 | 
            -
             | 
| 4044 4053 | 
             
                    if importance_threshold is not None:
         | 
| 4045 4054 | 
             
                        filtered_importances = [
         | 
| 4046 | 
            -
                            (name, importance) | 
| 4055 | 
            +
                            (name, importance)
         | 
| 4056 | 
            +
                            for name, importance in filtered_importances.items()
         | 
| 4057 | 
            +
                            if importance > importance_threshold
         | 
| 4047 4058 | 
             
                        ]
         | 
| 4048 4059 | 
             
                    if max_features is not None:
         | 
| 4049 4060 | 
             
                        filtered_importances = list(filtered_importances)[:max_features]
         | 
    
        upgini/metrics.py
    CHANGED
    
    | @@ -14,7 +14,9 @@ from lightgbm import LGBMClassifier, LGBMRegressor | |
| 14 14 | 
             
            import lightgbm as lgb
         | 
| 15 15 | 
             
            from numpy import log1p
         | 
| 16 16 | 
             
            from pandas.api.types import is_numeric_dtype
         | 
| 17 | 
            +
            # from sklearn.calibration import LabelEncoder
         | 
| 17 18 | 
             
            from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
         | 
| 19 | 
            +
            from sklearn.preprocessing import OrdinalEncoder
         | 
| 18 20 |  | 
| 19 21 | 
             
            from upgini.utils.features_validator import FeaturesValidator
         | 
| 20 22 | 
             
            from upgini.utils.sklearn_ext import cross_validate
         | 
| @@ -754,6 +756,8 @@ class LightGBMWrapper(EstimatorWrapper): | |
| 754 756 | 
             
                        logger=logger,
         | 
| 755 757 | 
             
                    )
         | 
| 756 758 | 
             
                    self.cat_features = None
         | 
| 759 | 
            +
                    # self.cat_features_encoders = dict()
         | 
| 760 | 
            +
                    self.cat_encoder = None
         | 
| 757 761 | 
             
                    self.n_classes = None
         | 
| 758 762 |  | 
| 759 763 | 
             
                def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         | 
| @@ -765,9 +769,22 @@ class LightGBMWrapper(EstimatorWrapper): | |
| 765 769 | 
             
                    self.cat_features = _get_cat_features(x)
         | 
| 766 770 | 
             
                    if self.cat_features:
         | 
| 767 771 | 
             
                        params["categorical_feature"] = self.cat_features
         | 
| 768 | 
            -
             | 
| 769 | 
            -
             | 
| 770 | 
            -
                         | 
| 772 | 
            +
                        # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
         | 
| 773 | 
            +
                        # params["categorical_feature"] = "notauto"
         | 
| 774 | 
            +
                        # params["categorical_feature"] = "name:" + ",".join(self.cat_features)  # Doesn't work
         | 
| 775 | 
            +
                        # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
         | 
| 776 | 
            +
                        # params["categorical_feature"] = ",".join(cat_indices)
         | 
| 777 | 
            +
                        pass
         | 
| 778 | 
            +
                        x = fill_na_cat_features(x, self.cat_features)
         | 
| 779 | 
            +
                        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
         | 
| 780 | 
            +
                        encoded = encoder.fit_transform(x[self.cat_features], y_numpy)
         | 
| 781 | 
            +
                        x[self.cat_features] = encoded
         | 
| 782 | 
            +
                        self.cat_encoder = encoder
         | 
| 783 | 
            +
                    # for feature in self.cat_features:
         | 
| 784 | 
            +
                    #     encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
         | 
| 785 | 
            +
                    #     x[feature] = encoder.fit_transform(x[feature])
         | 
| 786 | 
            +
                    #     self.cat_features_encoders[feature] = encoder
         | 
| 787 | 
            +
                        # x[feature] = x[feature].astype("category").cat.codes
         | 
| 771 788 | 
             
                    if not is_numeric_dtype(y_numpy):
         | 
| 772 789 | 
             
                        y_numpy = correct_string_target(y_numpy)
         | 
| 773 790 |  | 
| @@ -776,9 +793,19 @@ class LightGBMWrapper(EstimatorWrapper): | |
| 776 793 | 
             
                def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         | 
| 777 794 | 
             
                    x, y_numpy, params = super()._prepare_to_calculate(x, y)
         | 
| 778 795 | 
             
                    if self.cat_features is not None:
         | 
| 796 | 
            +
                        params["categorical_feature"] = self.cat_features
         | 
| 797 | 
            +
                        # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
         | 
| 798 | 
            +
                        # params["categorical_feature"] = "notauto"
         | 
| 799 | 
            +
                        # params["categorical_feature"] = "name:" + ",".join(self.cat_features)  # Doesn't work
         | 
| 800 | 
            +
                        # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
         | 
| 801 | 
            +
                        # params["categorical_feature"] = ",".join(cat_indices)
         | 
| 779 802 | 
             
                        x = fill_na_cat_features(x, self.cat_features)
         | 
| 780 | 
            -
                         | 
| 781 | 
            -
                            x[ | 
| 803 | 
            +
                        if self.cat_encoder is not None:
         | 
| 804 | 
            +
                            x[self.cat_features] = self.cat_encoder.transform(x[self.cat_features])
         | 
| 805 | 
            +
                        # for feature in self.cat_features:
         | 
| 806 | 
            +
                        #     encoder = self.cat_features_encoders[feature]
         | 
| 807 | 
            +
                        #     x[feature] = encoder.transform(x[feature])
         | 
| 808 | 
            +
                            # x[feature] = x[feature].astype("category").cat.codes
         | 
| 782 809 | 
             
                    if not is_numeric_dtype(y):
         | 
| 783 810 | 
             
                        y_numpy = correct_string_target(y_numpy)
         | 
| 784 811 | 
             
                    return x, y_numpy, params
         | 
    
        upgini/utils/target_utils.py
    CHANGED
    
    | @@ -204,7 +204,7 @@ def balance_undersample( | |
| 204 204 | 
             
            def balance_undersample_forced(
         | 
| 205 205 | 
             
                df: pd.DataFrame,
         | 
| 206 206 | 
             
                target_column: str,
         | 
| 207 | 
            -
                id_columns: List[str],
         | 
| 207 | 
            +
                id_columns: Optional[List[str]],
         | 
| 208 208 | 
             
                date_column: str,
         | 
| 209 209 | 
             
                task_type: ModelTaskType,
         | 
| 210 210 | 
             
                cv_type: Optional[CVType],
         | 
| @@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4) | |
| 287 287 |  | 
| 288 288 | 
             
            def balance_undersample_time_series_trunc(
         | 
| 289 289 | 
             
                df: pd.DataFrame,
         | 
| 290 | 
            -
                id_columns: List[str],
         | 
| 290 | 
            +
                id_columns: Optional[List[str]],
         | 
| 291 291 | 
             
                date_column: str,
         | 
| 292 292 | 
             
                sample_size: int,
         | 
| 293 293 | 
             
                random_state: int = 42,
         | 
| @@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc( | |
| 298 298 | 
             
                **kwargs,
         | 
| 299 299 | 
             
            ):
         | 
| 300 300 | 
             
                # Convert date column to datetime
         | 
| 301 | 
            +
                if id_columns is None:
         | 
| 302 | 
            +
                    id_columns = [date_column]
         | 
| 301 303 | 
             
                dates_df = df[id_columns + [date_column]].copy()
         | 
| 302 304 | 
             
                dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
         | 
| 303 305 |  | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.3
         | 
| 2 2 | 
             
            Name: upgini
         | 
| 3 | 
            -
            Version: 1.2.71a3832.dev11
         | 
| 3 | 
            +
            Version: 1.2.71a3832.dev13
         | 
| 4 4 | 
             
            Summary: Intelligent data search & enrichment for Machine Learning
         | 
| 5 5 | 
             
            Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
         | 
| 6 6 | 
             
            Project-URL: Homepage, https://upgini.com/
         | 
| @@ -1,12 +1,12 @@ | |
| 1 | 
            -
            upgini/__about__.py,sha256= | 
| 1 | 
            +
            upgini/__about__.py,sha256=buorll9F2OX4EgV8VmlIrj09nqmsSmqAG8T8p6hRCls,34
         | 
| 2 2 | 
             
            upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
         | 
| 3 3 | 
             
            upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
         | 
| 4 4 | 
             
            upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
         | 
| 5 5 | 
             
            upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
         | 
| 6 | 
            -
            upgini/features_enricher.py,sha256= | 
| 6 | 
            +
            upgini/features_enricher.py,sha256=lk80Bx9U36lva6T4lPHBFk88ivrpZ-2uwwMwQg0LglE,207023
         | 
| 7 7 | 
             
            upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
         | 
| 8 8 | 
             
            upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
         | 
| 9 | 
            -
            upgini/metrics.py,sha256= | 
| 9 | 
            +
            upgini/metrics.py,sha256=ot6AhxfRRTzM-dNApWTvmteLBAmGjD9OyAuKmtUTprE,40630
         | 
| 10 10 | 
             
            upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
         | 
| 11 11 | 
             
            upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
         | 
| 12 12 | 
             
            upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
         | 
| @@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml | |
| 66 66 | 
             
            upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
         | 
| 67 67 | 
             
            upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
         | 
| 68 68 | 
             
            upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
         | 
| 69 | 
            -
            upgini/utils/target_utils.py,sha256= | 
| 69 | 
            +
            upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
         | 
| 70 70 | 
             
            upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
         | 
| 71 71 | 
             
            upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
         | 
| 72 72 | 
             
            upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
         | 
| 73 | 
            -
            upgini-1.2.71a3832. | 
| 74 | 
            -
            upgini-1.2.71a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
         | 
| 75 | 
            -
            upgini-1.2.71a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
         | 
| 76 | 
            -
            upgini-1.2.71a3832.dev11.dist-info/RECORD,,
         | 
| 73 | 
            +
            upgini-1.2.71a3832.dev13.dist-info/METADATA,sha256=JdRugxJAMW4KLyRuz7yIX_PqSz_nObynmhkW5-g_lVs,49102
         | 
| 74 | 
            +
            upgini-1.2.71a3832.dev13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
         | 
| 75 | 
            +
            upgini-1.2.71a3832.dev13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
         | 
| 76 | 
            +
            upgini-1.2.71a3832.dev13.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |