upgini 1.2.120a1__py3-none-any.whl → 1.2.121a2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.120a1"
+ __version__ = "1.2.121a2"
upgini/data_source/data_source_publisher.py CHANGED
@@ -519,21 +519,24 @@ class DataSourcePublisher:
         description: str = "",
     ):
         if model_type is not None and model_type not in ["ONNX", "CATBOOST"]:
-            raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
+            raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX, CATBOOST")
         metadata = {
             "modelName": name,
             "inputNames": input_names,
             "dateColumn": date_column,
             "scoreName": score_name,
             "searchTaskId": search_id,
-            "modelType": model_type or "ONNX",
+            "modelType": model_type or "CATBOOST",
             "description": description,
         }

         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             try:
-                self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
+                result = self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
+                if "ERROR" in result:
+                    raise Exception(result)
+                print(result)
             except Exception:
                 self.logger.exception("Failed to upload autofe model")
                 raise
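
The changed try block above now inspects the server response instead of discarding it, so a failed upload surfaces the server's message. A minimal sketch of that pattern, with a hypothetical stub standing in for _RestClient.upload_autofe_model:

    def upload_autofe_model_stub(file_path: str, metadata: dict, trace_id: str) -> str:
        # Hypothetical stand-in: the real client POSTs the file and returns the server's reply as text
        return f"ERROR: model file {file_path} was rejected"

    result = upload_autofe_model_stub("model.cbm", {"modelType": "CATBOOST"}, "trace-123")
    if "ERROR" in result:
        raise Exception(result)  # propagate the server-side failure, as in the diff above
    print(result)
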
upgini/dataset.py CHANGED
@@ -694,9 +694,7 @@ class Dataset:

     def prepare_uploading_file(self, base_path: str) -> str:
         parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
-        print("Before saving parquet file")
         self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
-        print("After saving parquet file")
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
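
With the debug prints removed, the method reduces to the gzip-compressed parquet write plus a size check. A standalone sketch of the same calls, assuming fastparquet is installed (data and path are hypothetical):

    import pandas as pd
    from pathlib import Path

    data = pd.DataFrame({"feature": [1, 2, 3], "target": [0, 1, 0]})
    parquet_file_path = "/tmp/sample_dataset.parquet"  # hypothetical location
    data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
    print(f"Size of prepared uploading file: {Path(parquet_file_path).stat().st_size}. {len(data)} rows")
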
upgini/features_enricher.py CHANGED
@@ -1028,13 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
            columns_renaming,
            _,
        ) = prepared_data
-
-       # rename baseline_score_column
-       reversed_renaming = {v: k for k, v in columns_renaming.items()}
-       baseline_score_column = self.baseline_score_column
-       if baseline_score_column is not None:
-           baseline_score_column = reversed_renaming[baseline_score_column]
-
+
        gc.collect()

        if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1089,7 +1083,7 @@
                has_time=has_time,
            )
            baseline_cv_result = baseline_estimator.cross_val_predict(
-               fitting_X, y_sorted, baseline_score_column
+               fitting_X, y_sorted, self.baseline_score_column
            )
            baseline_metric = baseline_cv_result.get_display_metric()
            if baseline_metric is None:
@@ -1192,7 +1186,7 @@
                    f"on client features: {eval_X_sorted.columns.to_list()}"
                )
                etalon_eval_results = baseline_estimator.calculate_metric(
-                   eval_X_sorted, eval_y_sorted, baseline_score_column
+                   eval_X_sorted, eval_y_sorted, self.baseline_score_column
                )
                etalon_eval_metric = etalon_eval_results.get_display_metric()
                self.logger.info(
@@ -2502,6 +2496,9 @@ if response.status_code == 200:
    ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
        if self._search_task is None:
            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+       features_meta = self._search_task.get_all_features_metadata_v2()
+       if features_meta is None:
+           raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

        start_time = time.time()
        search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
@@ -2531,7 +2528,6 @@
            self.__display_support_link(msg)
            return None, {}, [], self.search_keys

-       features_meta = self._search_task.get_all_features_metadata_v2()
        online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
        if len(online_api_features) > 0:
            self.logger.warning(
@@ -3382,6 +3378,7 @@ if response.status_code == 200:
        except KeyboardInterrupt as e:
            print(self.bundle.get("search_stopping"))
            self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
+           self._search_task = None
            self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
            print(self.bundle.get("search_stopped"))
            raise e
upgini/http.py CHANGED
@@ -426,26 +426,19 @@ class _RestClient:
    ) -> SearchTaskResponse:
        api_path = self.INITIAL_SEARCH_URI_FMT_V2

-       print("Before getting track metrics")
        track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
-       print("After getting track metrics")

        def open_and_send():
            md5_hash = hashlib.md5()
-           print("Before opening file to calculate hashes")
            with open(file_path, "rb") as file:
                content = file.read()
                md5_hash.update(content)
            digest = md5_hash.hexdigest()
            metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
-           print("After calculating md5")

-           print("Before calculating sha256")
            digest_sha256 = file_hash(file_path)
-           print("After calculating sha256")
            metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

-           print("Before opening file to send")
            with open(file_path, "rb") as file:
                files = {
                    "metadata": (
@@ -473,12 +466,9 @@ class _RestClient:
                )
            additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}

-           print("Before sending request for initial search")
-           response = self._send_post_file_req_v2(
+           return self._send_post_file_req_v2(
                api_path, files, trace_id=trace_id, additional_headers=additional_headers
            )
-           print("After sending request")
-           return response

        response = self._with_unauth_retry(open_and_send)
        return SearchTaskResponse(response)
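
The hashing logic retained in open_and_send computes an MD5 checksum plus a second digest (via upgini's file_hash helper) for the upload metadata. A minimal standalone sketch of equivalent checksum computation with the standard library; hashlib.sha256 is used here on the assumption that file_hash produces a SHA-256 digest:

    import hashlib
    from pathlib import Path

    def checksums(file_path: str) -> tuple[str, str]:
        """Return (md5, sha256) hex digests of a file, mirroring the checksumMD5/digest metadata fields."""
        data = Path(file_path).read_bytes()
        md5 = hashlib.md5(data).hexdigest()
        sha256 = hashlib.sha256(data).hexdigest()  # assumed equivalent of upgini's file_hash helper
        return md5, sha256
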
upgini/resource_bundle/strings.properties CHANGED
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
 # features validation
 empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
 high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
-# one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
+one_hot_encoded_features=One hot encoded features detected: {}

 # Dataset validation
 dataset_too_few_rows=X size should be at least {} rows after validation
upgini/utils/features_validator.py CHANGED
@@ -24,7 +24,7 @@ class FeaturesValidator:
        features_for_generate: Optional[List[str]] = None,
        columns_renaming: Optional[Dict[str, str]] = None,
    ) -> Tuple[List[str], List[str]]:
-       # one_hot_encoded_features = []
+       one_hot_encoded_features = []
        empty_or_constant_features = []
        high_cardinality_features = []
        warnings = []
@@ -36,23 +36,17 @@ class FeaturesValidator:
            value_counts = column.value_counts(dropna=False, normalize=True)
            most_frequent_percent = value_counts.iloc[0]

-           if most_frequent_percent >= 0.99:
+           if len(value_counts) == 1:
                empty_or_constant_features.append(f)
+           elif most_frequent_percent >= 0.99:
+               if self.is_one_hot_encoded(column):
+                   one_hot_encoded_features.append(f)
+               else:
+                   empty_or_constant_features.append(f)

-           # TODO implement one-hot encoding check
-           # if len(value_counts) == 1:
-           # empty_or_constant_features.append(f)
-           # elif most_frequent_percent >= 0.99:
-           # empty_or_constant_features.append(f)
-           # if set(value_counts.index.to_list()) == {0, 1}:
-           # one_hot_encoded_features.append(f)
-           # else:
-           # empty_or_constant_features.append(f)
-           # continue
-
-           # if one_hot_encoded_features:
-           # msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
-           # warnings.append(msg)
+       if one_hot_encoded_features:
+           msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
+           self.logger.info(msg)

        columns_renaming = columns_renaming or {}

@@ -102,3 +96,30 @@
    @staticmethod
    def find_constant_features(df: pd.DataFrame) -> List[str]:
        return [i for i in df if df[i].nunique() <= 1]
+
+   @staticmethod
+   def is_one_hot_encoded(series: pd.Series) -> bool:
+       try:
+           # Column contains only 0 and 1 (as strings or numbers)
+           series = series.astype(float)
+           if set(series.unique()) != {0.0, 1.0}:
+               return False
+
+           series = series.astype(int)
+
+           # Column doesn't contain any NaN, np.NaN, space, null, etc.
+           if not (series.isin([0, 1])).all():
+               return False
+
+           vc = series.value_counts()
+           # Column should contain both 0 and 1
+           if len(vc) != 2:
+               return False
+
+           # Minority class is 1
+           if vc[1] >= vc[0]:
+               return False
+
+           return True
+       except ValueError:
+           return False
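
A minimal standalone sketch of the heuristic added above, useful for sanity-checking which columns it treats as one-hot indicators (the sample data and the helper name looks_one_hot are hypothetical):

    import pandas as pd

    def looks_one_hot(series: pd.Series) -> bool:
        # Mirrors FeaturesValidator.is_one_hot_encoded: only 0/1 values (numeric or string),
        # both classes present, and 1 is the minority class.
        try:
            as_float = series.astype(float)
            if set(as_float.unique()) != {0.0, 1.0}:
                return False
            vc = as_float.astype(int).value_counts()
            return len(vc) == 2 and vc[1] < vc[0]
        except ValueError:
            return False

    print(looks_one_hot(pd.Series([0, 0, 0, 1, 0, 0])))   # True: sparse 0/1 indicator
    print(looks_one_hot(pd.Series(["0", "1", "0", "0"])))  # True: string-encoded 0/1
    print(looks_one_hot(pd.Series([1, 1, 1, 0])))          # False: 1 is the majority class
    print(looks_one_hot(pd.Series([0, 2, 0, 0])))          # False: values other than 0/1
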
upgini/utils/sklearn_ext.py CHANGED
@@ -1301,6 +1301,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features], y_train)

+   # OrdinalEncoder doesn't support progressive encoding with target
    X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

@@ -1314,10 +1315,8 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features], y_train)

-   # Progressive encoding on train (using y)
-   X_train[cat_features] = encoder.transform(X_train[cat_features], y_train).astype(int)
-
-   # Static encoding on validation (no y)
+   # OrdinalEncoder doesn't support progressive encoding with target
+   X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

    return X_train, y_train, X_test, y_test, [], encoder
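
For reference, a minimal sketch of the pattern the fixed lines now follow, assuming the OrdinalEncoder here is scikit-learn's (whose transform accepts only X, hence no progressive, target-aware encoding; the toy data is hypothetical):

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    cat_features = ["city"]
    X_train = pd.DataFrame({"city": ["a", "b", "a", "c"]})
    X_test = pd.DataFrame({"city": ["b", "d"]})  # "d" is unseen at fit time

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features])  # fit takes an optional y but ignores it

    # transform() accepts only X, so train and test are encoded the same (static) way;
    # categories unseen during fit are mapped to -1
    X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
    print(X_test)  # "d" -> -1
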
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.120a1
+Version: 1.2.121a2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -1,10 +1,10 @@
-upgini/__about__.py,sha256=J4ou6xfTwIgzTXi7mnxG9WD4vn49_cFGZVdB8RZEIPM,26
+upgini/__about__.py,sha256=Dv8DzHbPAHs_fY_MACW4HNqnYW7CilejShdVPFkTaYM,26
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=9xYeqp-Ti3-QcsucyxlDFOHQef6ZQsBX7bOZMCyT2rM,31665
+upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=C9pZKusj_QnG9coPVAa1a_88VC-lLR4Tre4uC10yt04,231852
-upgini/http.py,sha256=CzDgSrYH6-R14G0d8xPyLalb-w42fjj9XOHVXh7leyM,44835
+upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
+upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
 upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
 upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
 upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
 upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=suRmAF1i7yiZ8vJjpEKdVr5Wqtr7o1_vjAhaN9B4DU0,26518
+upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbxtSDV94OCM3eqUU,26653
 upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=cNeVkWZMyjGCYGqmOOeJqisqPSEBtmfIw_U1rmgQw4w,29285
+upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
 upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
-upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
+upgini/utils/features_validator.py,sha256=RAnfX80GBFcz6-SlTSR0DF6BZzf7A7IL8dlIqEoSz_s,4265
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -68,13 +68,13 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
 upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
-upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
+upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
 upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.120a1.dist-info/METADATA,sha256=Ai4c0bpRvXFgEYB78zVltQNbWv6HpPdc96IAw85kPJI,50745
-upgini-1.2.120a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.120a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.120a1.dist-info/RECORD,,
+upgini-1.2.121a2.dist-info/METADATA,sha256=1XVh2jWKC2I3ElN4ftyEveTny9C1pU5z69Osnp6q7_s,50745
+upgini-1.2.121a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.121a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.121a2.dist-info/RECORD,,