upgini 1.2.87.dev3__py3-none-any.whl → 1.2.87.dev5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.87.dev3"
+__version__ = "1.2.87.dev5"
upgini/features_enricher.py CHANGED
@@ -300,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -315,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
             self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
             file_metadata = self._search_task.get_file_metadata(trace_id)
-            x_columns = [c.originalName or c.name for c in file_metadata.columns]
+            x_columns = [c.name for c in file_metadata.columns]
             self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
             df = pd.DataFrame(columns=x_columns)
             self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -2299,11 +2299,16 @@ if response.status_code == 200:

         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)

-        self.__validate_search_keys(self.search_keys, self.search_id)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
+        # If there are no important features, return original dataframe
+        if not filtered_columns:
+            msg = self.bundle.get("no_important_features_for_transform")
+            self.__log_warning(msg, show_support_link=True)
+            return X, {c: c for c in X.columns}, [], dict()

-        if len(self.feature_names_) == 0:
-            self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, [], {}
+        self.__validate_search_keys(self.search_keys, self.search_id)

         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
@@ -2342,9 +2347,7 @@ if response.status_code == 200:

         is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

-        columns_to_drop = [
-            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
-        ]
+        columns_to_drop = [c for c in df.columns if c in self.feature_names_]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
@@ -2400,6 +2403,17 @@ if response.status_code == 200:
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming

+        # If there are no external features, we don't call backend on transform
+        external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
+        if not external_features:
+            self.logger.warning(
+                "No external features found, returning original dataframe"
+                f" with generated important features: {filtered_columns}"
+            )
+            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
+            return df[filtered_columns], columns_renaming, generated_features, search_keys
+
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
         features_for_transform = self._search_task.get_features_for_transform() or []
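
Note: this new short-circuit avoids the backend round-trip entirely when every positive-SHAP feature came from the client's own data ("etalon" source). A self-contained sketch of the filter, with a hypothetical FeatureMeta dataclass standing in for the real features_meta entries:

from dataclasses import dataclass

@dataclass
class FeatureMeta:
    name: str
    shap_value: float
    source: str  # "etalon" marks a client-provided feature

features_meta = [
    FeatureMeta("client_feature", 0.4, "etalon"),
    FeatureMeta("generated_feature", 0.1, "etalon"),
]
external_features = [
    fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"
]
print(bool(external_features))  # False -> transform skips the backend call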
@@ -2444,6 +2458,8 @@ if response.status_code == 200:
         # Explode multiple search keys
         df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)

+        # Convert search keys and generate features on them
+
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         if email_column:
@@ -2632,17 +2648,15 @@ if response.status_code == 200:
             how="left",
         )

+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in filtered_columns
+        ]
         selecting_columns = [
             c
-            for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
-            if c not in self.dropped_client_feature_names_
+            for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
+            if c not in self.zero_shap_client_features
         ]
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
-        selecting_columns.extend(
-            c for c in filtered_columns if c in result.columns and c not in validated_X.columns
-        )
+        selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)

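Note: the rewritten selection keeps a generated feature only when feature selection is off or the feature survived SHAP filtering, then appends enriched columns while skipping anything already selected. A small sketch of the first predicate with hypothetical inputs:

fit_select_features = True
generated_features = ["autofe_feature_1", "autofe_feature_2"]
filtered_columns = ["autofe_feature_1", "external_score"]

selected_generated_features = [
    c for c in generated_features if not fit_select_features or c in filtered_columns
]
print(selected_generated_features)  # ['autofe_feature_1']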
@@ -2942,7 +2956,10 @@ if response.status_code == 200:
             self.__log_warning(fintech_warning)
         df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
         if full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+            if len(df) == 0:
+                raise ValidationError(full_duplicates_warning)
+            else:
+                self.__log_warning(full_duplicates_warning)

         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
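
Note: `clean_full_duplicates` can drop every row when all duplicates carry conflicting targets, and fit previously continued on the empty frame only to fail later with a less actionable error. A condensed sketch of the escalated handling (`handle_dedup_result` is a hypothetical name for the inlined logic above):

import logging
from typing import Optional

import pandas as pd

class ValidationError(Exception):
    pass

def handle_dedup_result(
    df: pd.DataFrame, warning: Optional[str], logger: logging.Logger
) -> pd.DataFrame:
    if warning:
        if len(df) == 0:
            # Nothing left to fit on: fail fast instead of logging and continuing.
            raise ValidationError(warning)
        logger.warning(warning)
    return df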
@@ -3345,9 +3362,13 @@ if response.status_code == 200:
         Xy[TARGET] = y
         validated_y = Xy[TARGET].copy()

-        if validated_y.nunique() < 2:
+        y_nunique = validated_y.nunique()
+        if y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant"))

+        if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
+
         return validated_y

     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3422,9 +3443,13 @@ if response.status_code == 200:
         else:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

-        if validated_eval_y.nunique() < 2:
+        eval_y_nunique = validated_eval_y.nunique()
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
+
         return validated_eval_X, validated_eval_y

     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
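
Note: both target validators now compute the unique-value count once and, for an explicitly binary task, reject anything other than exactly two classes. A runnable sketch of the new check (the enum and message below are simplified stand-ins for upgini's ModelTaskType and resource-bundle strings):

from enum import Enum

import pandas as pd

class ModelTaskType(Enum):
    BINARY = "BINARY"

class ValidationError(Exception):
    pass

def validate_target(y: pd.Series, task: ModelTaskType) -> pd.Series:
    y_nunique = y.nunique()
    if y_nunique < 2:
        raise ValidationError("y is constant")
    # New in dev5: a BINARY task must have exactly 2 unique target values.
    if task == ModelTaskType.BINARY and y_nunique != 2:
        raise ValidationError(
            f"Binary target should contain only 2 unique values, but {y_nunique} found"
        )
    return y

try:
    validate_target(pd.Series([0, 1, 2]), ModelTaskType.BINARY)
except ValidationError as e:
    print(e)  # Binary target should contain only 2 unique values, but 3 found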
@@ -3966,10 +3991,11 @@ if response.status_code == 200:
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

+        # To be sure that names with hash suffixes
         df = df.rename(columns=original_names_dict)

         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -3981,7 +4007,7 @@ if response.status_code == 200:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns

             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -3997,13 +4023,13 @@ if response.status_code == 200:
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))

         for feature_meta in features_meta:
-
-            is_client_feature = feature_meta.name in df.columns
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            is_client_feature = original_name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if self.fit_select_features:
-                    self.dropped_client_feature_names_.append(feature_meta.name)
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue

             # Use only important features
upgini/metrics.py CHANGED
@@ -807,14 +807,16 @@ class CatBoostWrapper(EstimatorWrapper):
         try:
             from catboost import Pool

+            cat_features = None
             if cat_encoder is not None:
                 if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
                     encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
-                    cat_features = None
                 else:
                     encoded = cat_encoder.transform(x[self.cat_features])
                     cat_features = encoded.columns.to_list()
                 x[self.cat_features] = encoded
+            else:
+                cat_features = self.cat_features

             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
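
Note: previously `cat_features` was assigned only inside the `cat_encoder is not None` branch, so the `Pool(...)` call below it could reference an unbound (or stale) local when no encoder was supplied. Initializing it up front and adding the `else` fallback to `self.cat_features` closes both gaps. A condensed, self-contained sketch of the fixed control flow (names simplified; not the wrapper's real signature):

from typing import List, Optional

import pandas as pd

def resolve_cat_features(
    x: pd.DataFrame,
    declared_cat_features: List[str],
    cat_encoder=None,
    time_series_cv: bool = False,
) -> Optional[List[str]]:
    cat_features = None  # now initialized on every path
    if cat_encoder is not None:
        if time_series_cv:
            # time-series CV: encode to ints and pass no cat_features to Pool
            encoded = cat_encoder.transform(x[declared_cat_features]).astype(int)
        else:
            encoded = cat_encoder.transform(x[declared_cat_features])
            cat_features = encoded.columns.to_list()
        x[declared_cat_features] = encoded
    else:
        # no encoder supplied: hand the declared categorical columns to CatBoost
        cat_features = declared_cat_features
    return cat_features

# With no encoder, the declared columns now reach CatBoost's Pool:
df = pd.DataFrame({"city": ["NY", "LA"], "spend": [1.0, 2.0]})
print(resolve_cat_features(df, ["city"]))  # ['city']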
upgini/resource_bundle/strings.properties CHANGED
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
 invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
 no_important_features_for_transform=There are no important features for transform. Return input as transformed
 search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

 # Validation errors
 # params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
 dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
-dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
upgini/utils/deduplicate_utils.py CHANGED
@@ -192,7 +192,7 @@ def clean_full_duplicates(
     unique_columns.remove(TARGET)
     marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
     if marked_duplicates.sum() > 0:
-        dups_indices = df[marked_duplicates].index.to_list()
+        dups_indices = df[marked_duplicates].index.to_list()[:100]
         nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
         num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
         share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
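
Note: capping the collected indexes at 100 keeps the `dataset_diff_target_duplicates` warning (reworded above to "Sample of incorrect row indexes") from dumping tens of thousands of index values into logs on large datasets. A tiny pandas repro of the capped sample:

import pandas as pd

df = pd.DataFrame({"feature": [1] * 500, "target": [0, 1] * 250})
unique_columns = ["feature"]  # identical features, conflicting targets
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
if marked_duplicates.sum() > 0:
    dups_indices = df[marked_duplicates].index.to_list()[:100]
    print(len(dups_indices))  # 100 reported, not all 500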
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.87.dev3
+Version: 1.2.87.dev5
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/RECORD RENAMED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=-MoNpjvEXC0uIle8xxIgQduzBZJlNzuW-1rPMTm_xc8,28
+upgini/__about__.py,sha256=wcphyJpGJs2mZPWvsK3omRtXm2Q4NsYXyO0X5zcwLMw,28
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=n8KBoBgJApLiRv4wXeSgfS-PfbB1D5aDOJfFnL0q6v8,214487
+upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
 upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
 upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
-upgini/metrics.py,sha256=CR_MKBcq1RlNMXeqc9S374JzHgunMl-mEmlTnZAm_VI,45236
+upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=U_ewTI-qPww4X3WcFG3qDf_jv2vo6RrlCehVDjqtzEI,27991
+upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -52,7 +52,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
-upgini/utils/deduplicate_utils.py,sha256=jm9ARZ0fbJFF3aJqj-xm_T6lNh-WErM0H0h6B_L1xQc,8948
+upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
 upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.87.dev3.dist-info/METADATA,sha256=Pm-acVK8TpDLvPsO0qluwSjmu0cb3FHmtXmqMj--2Ag,49167
-upgini-1.2.87.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.87.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.87.dev3.dist-info/RECORD,,
+upgini-1.2.87.dev5.dist-info/METADATA,sha256=Jdb6gn8ijXK4ccs5hC9yEPA6dQBzc5FtelPXOJgBfJA,49167
+upgini-1.2.87.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.87.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.87.dev5.dist-info/RECORD,,