upgini 1.1.240__py3-none-any.whl → 1.1.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/dataset.py CHANGED
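The thread running through the dataset.py hunks below is dependency injection: Dataset no longer re-resolves a REST client from endpoint + api_key on every call, but receives one shared _RestClient instance up front. A minimal sketch of the resulting wiring, with stub classes standing in for the real ones (constructor argument lists are abbreviated and illustrative only):

from typing import Optional

# Sketch only: stub classes mirroring the injection pattern in the hunks below.
class _RestClient:
    def __init__(self, service_endpoint: str, refresh_token: str):
        self._service_endpoint = service_endpoint
        self._refresh_token = refresh_token

class Dataset:
    def __init__(self, dataset_name: str, rest_client: Optional[_RestClient] = None):
        # One shared client replaces the endpoint/api_key pair.
        self.dataset_name = dataset_name
        self.rest_client = rest_client

client = _RestClient("https://search.upgini.com", "api-token")
ds = Dataset("sample_ds", rest_client=client)  # the same client is reused everywhere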
@@ -20,7 +20,7 @@ from pandas.api.types import (
 from pandas.core.dtypes.common import is_period_dtype
 
 from upgini.errors import ValidationError
-from upgini.http import ProgressStage, SearchProgress, get_rest_client
+from upgini.http import ProgressStage, SearchProgress, _RestClient
 from upgini.metadata import (
     EVAL_SET_INDEX,
     SYSTEM_COLUMNS,
@@ -78,8 +78,7 @@ class Dataset: # (pd.DataFrame):
         search_keys: Optional[List[Tuple[str, ...]]] = None,
         model_task_type: Optional[ModelTaskType] = None,
         random_state: Optional[int] = None,
-        endpoint: Optional[str] = None,
-        api_key: Optional[str] = None,
+        rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
         warning_counter: Optional[WarningCounter] = None,
         **kwargs,
@@ -114,8 +113,7 @@ class Dataset: # (pd.DataFrame):
         self.hierarchical_subgroup_keys = []
         self.file_upload_id: Optional[str] = None
         self.etalon_def: Optional[Dict[str, str]] = None
-        self.endpoint = endpoint
-        self.api_key = api_key
+        self.rest_client = rest_client
        self.random_state = random_state
        self.columns_renaming: Dict[str, str] = {}
        self.imbalanced: bool = False
@@ -983,10 +981,10 @@ class Dataset: # (pd.DataFrame):
             runtime_parameters=runtime_parameters,
         )
 
-        if self.file_upload_id is not None and get_rest_client(self.endpoint, self.api_key).check_uploaded_file_v2(
+        if self.file_upload_id is not None and self.rest_client.check_uploaded_file_v2(
             trace_id, self.file_upload_id, file_metadata
         ):
-            search_task_response = get_rest_client(self.endpoint, self.api_key).initial_search_without_upload_v2(
+            search_task_response = self.rest_client.initial_search_without_upload_v2(
                 trace_id, self.file_upload_id, file_metadata, file_metrics, search_customization
             )
         else:
@@ -999,7 +997,7 @@ class Dataset: # (pd.DataFrame):
                 progress_bar.progress = search_progress.to_progress_bar()
             if progress_callback is not None:
                 progress_callback(search_progress)
-            search_task_response = get_rest_client(self.endpoint, self.api_key).initial_search_v2(
+            search_task_response = self.rest_client.initial_search_v2(
                 trace_id, parquet_file_path, file_metadata, file_metrics, search_customization
             )
             # if progress_bar is not None:
@@ -1015,8 +1013,7 @@ class Dataset: # (pd.DataFrame):
             extract_features,
             accurate_model,
             task_type=self.task_type,
-            endpoint=self.endpoint,
-            api_key=self.api_key,
+            rest_client=self.rest_client,
             logger=self.logger,
         )
 
@@ -1053,10 +1050,10 @@ class Dataset: # (pd.DataFrame):
                 progress_bar.progress = search_progress.to_progress_bar()
             if progress_callback is not None:
                 progress_callback(search_progress)
-        if self.file_upload_id is not None and get_rest_client(self.endpoint, self.api_key).check_uploaded_file_v2(
+        if self.file_upload_id is not None and self.rest_client.check_uploaded_file_v2(
             trace_id, self.file_upload_id, file_metadata
         ):
-            search_task_response = get_rest_client(self.endpoint, self.api_key).validation_search_without_upload_v2(
+            search_task_response = self.rest_client.validation_search_without_upload_v2(
                 trace_id, self.file_upload_id, initial_search_task_id, file_metadata, file_metrics, search_customization
             )
         else:
@@ -1065,7 +1062,7 @@ class Dataset: # (pd.DataFrame):
             # To avoid rate limit
             time.sleep(1)
 
-            search_task_response = get_rest_client(self.endpoint, self.api_key).validation_search_v2(
+            search_task_response = self.rest_client.validation_search_v2(
                 trace_id,
                 parquet_file_path,
                 initial_search_task_id,
@@ -1085,8 +1082,7 @@ class Dataset: # (pd.DataFrame):
             return_scores,
             extract_features,
             initial_search_task_id=initial_search_task_id,
-            endpoint=self.endpoint,
-            api_key=self.api_key,
+            rest_client=self.rest_client,
             logger=self.logger,
         )
upgini/features_enricher.py CHANGED
@@ -233,7 +233,7 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_importances_ = []
         self.search_id = search_id
         if search_id:
-            search_task = SearchTask(search_id, endpoint=self.endpoint, api_key=self._api_key, logger=self.logger)
+            search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
 
             print(bundle.get("search_by_task_id_start"))
             trace_id = str(uuid.uuid4())
@@ -297,7 +297,8 @@ class FeaturesEnricher(TransformerMixin):
     def _set_api_key(self, api_key: str):
         self._api_key = api_key
         if self.logs_enabled:
-            self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key, self.client_ip, self.client_visitorid)
+            self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key,
+                                                     self.client_ip, self.client_visitorid)
 
     api_key = property(_get_api_key, _set_api_key)
 
@@ -855,9 +856,17 @@ class FeaturesEnricher(TransformerMixin):
 
         if X is not None and y is None:
             raise ValidationError("X passed without y")
+
+        effective_X = X if X is not None else self.X
+        effective_eval_set = eval_set if eval_set is not None else self.eval_set
+
+        effective_X = X if X is not None else self.X
+        effective_eval_set = eval_set if eval_set is not None else self.eval_set
 
         validate_scoring_argument(scoring)
 
+        self._validate_baseline_score(effective_X, effective_eval_set)
+
         if self._has_paid_features(exclude_features_sources):
             msg = bundle.get("metrics_with_paid_features")
             self.logger.warning(msg)
@@ -1000,15 +1009,17 @@ class FeaturesEnricher(TransformerMixin):
             enriched_metric = None
             uplift = None
 
+            effective_X = X if X is not None else self.X
+            effective_y = y if y is not None else self.y
             train_metrics = {
                 bundle.get("quality_metrics_segment_header"): bundle.get("quality_metrics_train_segment"),
-                bundle.get("quality_metrics_rows_header"): _num_samples(self.X),
+                bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
                 # bundle.get("quality_metrics_match_rate_header"): self._search_task.initial_max_hit_rate_v2(),
             }
             if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                 y_sorted
             ):
-                train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(self.y.mean(), 4)
+                train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(np.mean(effective_y), 4)
             if etalon_metric is not None:
                 train_metrics[bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
             if enriched_metric is not None:
@@ -1064,18 +1075,19 @@ class FeaturesEnricher(TransformerMixin):
                 else:
                     eval_uplift = None
 
+                effective_eval_set = eval_set if eval_set is not None else self.eval_set
                 eval_metrics = {
                     bundle.get("quality_metrics_segment_header"): bundle.get(
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
-                    bundle.get("quality_metrics_rows_header"): _num_samples(self.eval_set[idx][0]),  # _num_samples(eval_X_sorted),
+                    bundle.get("quality_metrics_rows_header"): _num_samples(effective_eval_set[idx][0]),
                     # bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
-                        self.eval_set[idx][1].mean(), 4
+                        np.mean(effective_eval_set[idx][1]), 4
                     )
                 if etalon_eval_metric is not None:
                     eval_metrics[
@@ -1091,6 +1103,9 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
         metrics_df = pd.DataFrame(metrics)
+        mean_target_hdr = bundle.get("quality_metrics_mean_target_header")
+        if mean_target_hdr in metrics_df.columns:
+            metrics_df[mean_target_hdr] = metrics_df[mean_target_hdr].astype("float64")
         do_without_pandas_limits(
             lambda: self.logger.info(f"Metrics calculation finished successfully:\n{metrics_df}")
         )
@@ -1802,10 +1817,9 @@ class FeaturesEnricher(TransformerMixin):
 
         dataset = Dataset(
             "sample_" + str(uuid.uuid4()),
-            df=df_without_features,  # type: ignore
-            endpoint=self.endpoint,  # type: ignore
-            api_key=self.api_key,  # type: ignore
-            date_format=self.date_format,  # type: ignore
+            df=df_without_features,
+            date_format=self.date_format,
+            rest_client=self.rest_client,
             logger=self.logger,
         )
         dataset.meaning_types = meaning_types
@@ -2135,11 +2149,10 @@ class FeaturesEnricher(TransformerMixin):
         dataset = Dataset(
             "tds_" + str(uuid.uuid4()),
             df=df,  # type: ignore
-            model_task_type=model_task_type,  # type: ignore
-            endpoint=self.endpoint,  # type: ignore
-            api_key=self.api_key,  # type: ignore
-            date_format=self.date_format,  # type: ignore
-            random_state=self.random_state,  # type: ignore
+            model_task_type=model_task_type,
+            date_format=self.date_format,
+            random_state=self.random_state,
+            rest_client=self.rest_client,
             logger=self.logger,
         )
         dataset.meaning_types = meaning_types
@@ -2468,6 +2481,36 @@ class FeaturesEnricher(TransformerMixin):
                 raise ValidationError(bundle.get("y_is_constant_eval_set"))
 
         return validated_eval_X, validated_eval_y
+
+    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
+        if self.baseline_score_column is not None:
+            if self.baseline_score_column not in X.columns:
+                raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
+            if X[self.baseline_score_column].isna().any():
+                raise ValidationError(bundle.get("baseline_score_column_has_na"))
+            if eval_set is not None:
+                if isinstance(eval_set, tuple):
+                    eval_set = [eval_set]
+                for eval in eval_set:
+                    if self.baseline_score_column not in eval[0].columns:
+                        raise ValidationError(bundle.get("baseline_score_column_not_exists"))
+                    if eval[0][self.baseline_score_column].isna().any():
+                        raise ValidationError(bundle.get("baseline_score_column_has_na"))
+
+    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
+        if self.baseline_score_column is not None:
+            if self.baseline_score_column not in X.columns:
+                raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
+            if X[self.baseline_score_column].isna().any():
+                raise ValidationError(bundle.get("baseline_score_column_has_na"))
+            if eval_set is not None:
+                if isinstance(eval_set, tuple):
+                    eval_set = [eval_set]
+                for eval in eval_set:
+                    if self.baseline_score_column not in eval[0].columns:
+                        raise ValidationError(bundle.get("baseline_score_column_not_exists"))
+                    if eval[0][self.baseline_score_column].isna().any():
+                        raise ValidationError(bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3396,6 +3439,8 @@ class FeaturesEnricher(TransformerMixin):
 
 def _num_samples(x):
     """Return number of samples in array-like x."""
+    if x is None:
+        return 0
     message = "Expected sequence or array-like, got %s" % type(x)
     if hasattr(x, "fit") and callable(x.fit):
         # Don't get num_samples from an ensembles length!
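Two quirks in the features_enricher.py hunks above are easy to miss: the effective_X/effective_eval_set assignments and the _validate_baseline_score method are each added twice (Python executes the second assignment and binds the second, identical method definition, so behavior is unchanged), and the raise inside the eval_set loop calls bundle.get("baseline_score_column_not_exists") without .format(...), leaving a literal {} in that message. The functional change is that metrics computation now falls back to the data captured at fit time when X, y, or eval_set are not passed, and _num_samples tolerates None. A self-contained sketch of the fallback pattern; the Enricher class here is a stand-in, not the real FeaturesEnricher:

import pandas as pd

def _num_samples(x):
    # None now counts as zero samples instead of raising.
    if x is None:
        return 0
    return len(x)

class Enricher:
    def __init__(self):
        self.X = pd.DataFrame({"a": [1, 2, 3]})  # captured during fit()

    def calculate_metrics(self, X: pd.DataFrame = None):
        # Prefer the explicitly passed X; fall back to the fitted one.
        effective_X = X if X is not None else self.X
        return {"rows": _num_samples(effective_X)}

print(Enricher().calculate_metrics())                          # {'rows': 3}
print(Enricher().calculate_metrics(pd.DataFrame({"a": [1]})))  # {'rows': 1}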
upgini/http.py CHANGED
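The notable behavioral change in the http.py hunks below: the tracking multipart part used to be attached only when search_customization was set (and the validation variant even used the filename "ide" instead of "tracking.json"), whereas it is now built unconditionally, so track metrics accompany every initial and validation search. The added print() calls around client creation and the initial search read like debugging leftovers. A hedged sketch of the resulting multipart layout, using requests directly in place of the library's _send_post_file_req_v2 wrapper; the URL and payloads are placeholders:

import json
import requests

track_metrics = {"ide": "jupyter", "ip": "203.0.113.7"}  # stand-in for get_track_metrics()

files = {
    "metadata": ("metadata.json", json.dumps({"name": "ds"}).encode(), "application/json"),
    # The tracking part is now always present, not only with search_customization:
    "tracking": ("tracking.json", json.dumps(track_metrics).encode(), "application/json"),
    "metrics": ("metrics.json", json.dumps({}).encode(), "application/json"),
    "file": ("ds.parquet", b"...", "application/octet-stream"),
}
# Each (filename, content, content_type) tuple becomes one multipart part.
req = requests.Request("POST", "https://example.invalid/initial-search", files=files).prepare()
print(req.headers["Content-Type"])  # multipart/form-data; boundary=...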
@@ -301,13 +301,14 @@ class _RestClient:
     USER_AGENT_HEADER_VALUE = "pyupgini/" + __version__
     SEARCH_KEYS_HEADER_NAME = "Search-Keys"
 
-    def __init__(self, service_endpoint, refresh_token, silent_mode=False, client_ip=None, client_visitorid=None):
+    def __init__(self, service_endpoint, refresh_token, client_ip=None, client_visitorid=None):
         # debug_requests_on()
         self._service_endpoint = service_endpoint
         self._refresh_token = refresh_token
-        self.silent_mode = silent_mode
+        # self.silent_mode = silent_mode
         self.client_ip = client_ip
         self.client_visitorid = client_visitorid
+        print(f"Created RestClient with {client_ip} and {client_visitorid}")
         self._access_token = self._refresh_access_token()
         # self._access_token: Optional[str] = None  # self._refresh_access_token()
         self.last_refresh_time = time.time()
@@ -441,6 +442,10 @@ class _RestClient:
     ) -> SearchTaskResponse:
         api_path = self.INITIAL_SEARCH_URI_FMT_V2
 
+        print(f"Start initial search with {self.client_ip} and {self.client_visitorid}")
+        track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
+        print(f"Sending track metrics: {track_metrics}")
+
         def open_and_send():
             md5_hash = hashlib.md5()
             with open(file_path, "rb") as file:
@@ -461,6 +466,11 @@ class _RestClient:
                     metadata_with_md5.json(exclude_none=True).encode(),
                     "application/json",
                 ),
+                "tracking": (
+                    "tracking.json",
+                    dumps(track_metrics).encode(),
+                    "application/json",
+                ),
                 "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
                 "file": (metadata_with_md5.name, file, "application/octet-stream"),
             }
@@ -470,11 +480,6 @@ class _RestClient:
                     search_customization.json(exclude_none=True).encode(),
                     "application/json",
                 )
-            files["tracking"] = (
-                "tracking.json",
-                dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
-                "application/json",
-            )
             additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
 
             return self._send_post_file_req_v2(
@@ -545,6 +550,11 @@ class _RestClient:
                     metadata_with_md5.json(exclude_none=True).encode(),
                     "application/json",
                 ),
+                "tracking": (
+                    "tracking.json",
+                    dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
+                    "application/json",
+                ),
                 "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
                 "file": (metadata_with_md5.name, file, "application/octet-stream"),
             }
@@ -554,11 +564,6 @@ class _RestClient:
                     search_customization.json(exclude_none=True).encode(),
                     "application/json",
                 )
-            files["tracking"] = (
-                "ide",
-                dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
-                "application/json",
-            )
 
             additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
 
@@ -922,12 +927,12 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
 @lru_cache()
 def _get_rest_client(backend_url: str, api_token: str,
                      client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
-    return _RestClient(backend_url, api_token)
+    return _RestClient(backend_url, api_token, client_ip, client_visitorid)
 
 
 class BackendLogHandler(logging.Handler):
-    def __init__(self, rest_client: _RestClient,
-                 client_ip: Optional[str] = None, client_visitorid: Optional[str] = None,
+    def __init__(self, rest_client: _RestClient,
+                 client_ip: Optional[str] = None, client_visitorid: Optional[str] = None,
                  *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.rest_client = rest_client
@@ -982,7 +987,7 @@ class LoggerFactory:
         root.handlers.clear()
 
     def get_logger(
-        self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
+        self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
         client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
     ) -> logging.Logger:
         url = _resolve_backend_url(backend_url)
@@ -994,7 +999,7 @@ class LoggerFactory:
 
         upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
         upgini_logger.handlers.clear()
-        rest_client = get_rest_client(backend_url, api_token)
+        rest_client = get_rest_client(backend_url, api_token, client_ip, client_visitorid)
         datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
         json_formatter = jsonlogger.JsonFormatter(
             "%(asctime)s %(threadName)s %(name)s %(levelname)s %(message)s",
upgini/metrics.py CHANGED
@@ -215,7 +215,7 @@ class EstimatorWrapper:
         self.groups = groups
 
     def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
-        X, y, fit_params = self._prepare_to_fit(X, y)
+        X, y, _, fit_params = self._prepare_to_fit(X, y)
         kwargs.update(fit_params)
         self.estimator.fit(X, y, **kwargs)
         return self
@@ -223,7 +223,13 @@ class EstimatorWrapper:
     def predict(self, **kwargs):
         return self.estimator.predict(**kwargs)
 
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        X, y, groups = self._prepare_data(X, y, groups=self.groups)
+        return X, y, groups, {}
+
+    def _prepare_data(
+        self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
+    ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
         for c in X.columns:
             if is_numeric_dtype(X[c]):
                 X[c] = X[c].astype(float)
@@ -233,36 +239,33 @@ class EstimatorWrapper:
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
 
-        joined = pd.concat([X, y], axis=1)
-        joined = joined[joined[y.name].notna()]
-        joined = joined.reset_index(drop=True)
-        X = joined.drop(columns=y.name)
-        y = np.array(list(joined[y.name].values))
-        return X, y, {}
-
-    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        for c in X.columns:
-            if is_numeric_dtype(X[c]):
-                X[c] = X[c].astype(float)
-            else:
-                X[c] = X[c].astype(str)
+        if groups is not None:
+            X["__groups"] = groups
+            X, y = self._remove_empty_target_rows(X, y)
+            groups = X["__groups"]
+            X.drop(columns="__groups", inplace=True)
+        else:
+            X, y = self._remove_empty_target_rows(X, y)
 
-        if not isinstance(y, pd.Series):
-            raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
+        return X, y, groups
 
+    def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
         joined = pd.concat([X, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
         X = joined.drop(columns=y.name)
         y = np.array(list(joined[y.name].values))
+
+        return X, y
+
+    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        X, y, _ = self._prepare_data(X, y)
         return X, y, {}
 
     def cross_val_predict(
         self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
     ) -> Optional[float]:
-        X, y, fit_params = self._prepare_to_fit(X, y)
-        # if isinstance(self.estimator, CatBoostClassifier) or isinstance(self.estimator, CatBoostRegressor):
-        #     fit_params["early_stopping_rounds"] = 20
+        X, y, groups, fit_params = self._prepare_to_fit(X, y)
 
         if X.shape[1] == 0:
             return None
@@ -278,7 +281,7 @@ class EstimatorWrapper:
             y=y,
             scoring=scorer,
             cv=self.cv,
-            groups=self.groups,
+            groups=groups,
             fit_params=fit_params,
             return_estimator=True,
         )
@@ -393,8 +396,8 @@ class CatBoostWrapper(EstimatorWrapper):
         self.cat_features = None
         self.cat_features_idx = None
 
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, params = super()._prepare_to_fit(X, y)
+    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        X, y, groups, params = super()._prepare_to_fit(X, y)
         self.cat_features = _get_cat_features(X)
         X = fill_na_cat_features(X, self.cat_features)
         # unique_cat_features = []
@@ -418,7 +421,7 @@ class CatBoostWrapper(EstimatorWrapper):
             del self.estimator._init_params["cat_features"]
 
         params.update({"cat_features": self.cat_features_idx})
-        return X, y, params
+        return X, y, groups, params
 
     def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         X, y, params = super()._prepare_to_calculate(X, y)
@@ -445,8 +448,8 @@ class LightGBMWrapper(EstimatorWrapper):
         )
         self.cat_features = None
 
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, dict]:
-        X, y, params = super()._prepare_to_fit(X, y)
+    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
+        X, y, groups, params = super()._prepare_to_fit(X, y)
         self.cat_features = _get_cat_features(X)
         X = fill_na_cat_features(X, self.cat_features)
         for feature in self.cat_features:
@@ -454,7 +457,7 @@ class LightGBMWrapper(EstimatorWrapper):
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
 
-        return X, y, params
+        return X, y, groups, params
 
     def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         X, y, params = super()._prepare_to_calculate(X, y)
@@ -483,8 +486,8 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         )
         self.cat_features = None
 
-    def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, params = super()._prepare_to_fit(X, y)
+    def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        X, y, groups, params = super()._prepare_to_fit(X, y)
         self.cat_features = _get_cat_features(X)
         num_features = [col for col in X.columns if col not in self.cat_features]
         X[num_features] = X[num_features].fillna(-999)
@@ -494,7 +497,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             X[feature] = X[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
-        return X, y, params
+        return X, y, groups, params
 
     def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         X, y, params = super()._prepare_to_calculate(X, y)
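The point of the metrics.py refactor above is to keep cross-validation groups row-aligned with X after rows with a missing target are dropped: _prepare_data stashes the groups in a temporary __groups column so they pass through the same filter as the features, and cross_val_predict then hands the filtered groups (rather than the original self.groups) to cross_validate. A minimal pure-pandas reproduction of the trick, with no upgini imports:

import numpy as np
import pandas as pd

X = pd.DataFrame({"f": [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([0.0, np.nan, 1.0, np.nan], name="target")
groups = np.array(["a", "a", "b", "b"])

# Stash groups as a column so they are filtered together with the rows.
X["__groups"] = groups
joined = pd.concat([X, y], axis=1)
joined = joined[joined[y.name].notna()].reset_index(drop=True)
X = joined.drop(columns=y.name)
groups = X.pop("__groups").to_numpy()
y = joined[y.name].to_numpy()

print(X["f"].tolist(), y.tolist(), groups.tolist())
# [1.0, 3.0] [0.0, 1.0] ['a', 'b'] -- groups stay aligned after the NaN-target drop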
upgini/resource_bundle/strings.properties CHANGED
@@ -126,6 +126,8 @@ eval_y_multiindex_unsupported=Multi index in y in eval_set is not supported
 eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
+baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
+baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and retry
 # target validation
 empty_target=Target is empty in all rows
 non_numeric_target=Binary target should be numerical type
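The two new bundle strings use positional {} placeholders filled via str.format at the raise site, as in _validate_baseline_score above. A tiny standalone sketch of the pattern, with a plain dict standing in for the resource bundle:

# A dict stands in for the resource bundle; keys and texts mirror the diff above.
bundle = {
    "baseline_score_column_not_exists": "baseline_score_column {} doesn't exist in input dataframe",
    "baseline_score_column_has_na": "baseline_score_column contains NaN. Clear it and retry",
}

print(bundle["baseline_score_column_not_exists"].format("my_baseline"))
# baseline_score_column my_baseline doesn't exist in input dataframe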
upgini/search_task.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
 
 from upgini import dataset
 from upgini.http import (
-    LoggerFactory,
+    _RestClient,
     ProviderTaskSummary,
     SearchProgress,
     SearchTaskSummary,
@@ -42,8 +42,7 @@ class SearchTask:
         accurate_model: bool = False,
         initial_search_task_id: Optional[str] = None,
         task_type: Optional[ModelTaskType] = None,
-        endpoint: Optional[str] = None,
-        api_key: Optional[str] = None,
+        rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
     ):
         self.search_task_id = search_task_id
@@ -54,8 +53,7 @@ class SearchTask:
         self.accurate_model = accurate_model
         self.task_type = task_type
         self.summary = None
-        self.endpoint = endpoint
-        self.api_key = api_key
+        self.rest_client = rest_client
         if logger is not None:
             self.logger = logger
         else:
@@ -65,7 +63,7 @@ class SearchTask:
         self.unused_features_for_generation: Optional[List[str]] = None
 
     def get_progress(self, trace_id: str) -> SearchProgress:
-        return get_rest_client(self.endpoint, self.api_key).get_search_progress(trace_id, self.search_task_id)
+        return self.rest_client.get_search_progress(trace_id, self.search_task_id)
 
     def poll_result(self, trace_id: str, quiet: bool = False, check_fit: bool = False) -> "SearchTask":
         completed_statuses = {"COMPLETED", "VALIDATION_COMPLETED"}
@@ -73,7 +71,7 @@ class SearchTask:
         submitted_statuses = {"SUBMITTED", "VALIDATION_SUBMITTED"}
         if not quiet:
             print(bundle.get("polling_search_task").format(self.search_task_id))
-            if is_demo_api_key(self.api_key):
+            if is_demo_api_key(self.rest_client._refresh_token):
                 print(bundle.get("polling_unregister_information"))
         search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
 
@@ -81,14 +79,14 @@ class SearchTask:
             with Spinner():
                 if self.PROTECT_FROM_RATE_LIMIT:
                     time.sleep(1)  # this is neccesary to avoid requests rate limit restrictions
-                self.summary = get_rest_client(self.endpoint, self.api_key).search_task_summary_v2(
+                self.summary = self.rest_client.search_task_summary_v2(
                     trace_id, search_task_id
                 )
                 while self.summary.status not in completed_statuses and (
                     not check_fit or "VALIDATION" not in self.summary.status
                 ):
                     time.sleep(self.POLLING_DELAY_SECONDS)
-                    self.summary = get_rest_client(self.endpoint, self.api_key).search_task_summary_v2(
+                    self.summary = self.rest_client.search_task_summary_v2(
                         trace_id, search_task_id
                     )
                     if self.summary.status in failed_statuses:
@@ -104,7 +102,7 @@ class SearchTask:
         except KeyboardInterrupt as e:
             if not check_fit:
                 print(bundle.get("search_stopping"))
-                get_rest_client(self.endpoint, self.api_key).stop_search_task_v2(trace_id, search_task_id)
+                self.rest_client.stop_search_task_v2(trace_id, search_task_id)
                 self.logger.warning(f"Search {search_task_id} stopped by user")
                 print(bundle.get("search_stopped"))
             raise e
@@ -132,7 +130,7 @@ class SearchTask:
         for provider_summary in self.summary.initial_important_providers:
             if provider_summary.status == "COMPLETED":
                 self.provider_metadata_v2.append(
-                    get_rest_client(self.endpoint, self.api_key).get_provider_search_metadata_v3(
+                    self.rest_client.get_provider_search_metadata_v3(
                         provider_summary.ads_search_task_id, trace_id
                     )
                 )
@@ -258,8 +256,8 @@ class SearchTask:
         if self.PROTECT_FROM_RATE_LIMIT:
             time.sleep(1)  # this is neccesary to avoid requests rate limit restrictions
         return _get_all_initial_raw_features_cached(
-            self.endpoint,
-            self.api_key,
+            self.rest_client._service_endpoint,
+            self.rest_client._refresh_token,
             trace_id,
             self.search_task_id,
             metrics_calculation,
@@ -269,7 +267,11 @@ class SearchTask:
     def get_target_outliers(self, trace_id: str) -> Optional[pd.DataFrame]:
         self._check_finished_initial_search()
         return _get_target_outliers_cached(
-            self.endpoint, self.api_key, trace_id, self.search_task_id, self.PROTECT_FROM_RATE_LIMIT
+            self.rest_client._service_endpoint,
+            self.rest_client._refresh_token,
+            trace_id,
+            self.search_task_id,
+            self.PROTECT_FROM_RATE_LIMIT
         )
 
     def get_max_initial_eval_set_hit_rate_v2(self) -> Optional[Dict[int, float]]:
@@ -287,8 +289,8 @@ class SearchTask:
     def get_all_validation_raw_features(self, trace_id: str, metrics_calculation=False) -> Optional[pd.DataFrame]:
         self._check_finished_validation_search()
         return _get_all_validation_raw_features_cached(
-            self.endpoint,
-            self.api_key,
+            self.rest_client._service_endpoint,
+            self.rest_client._refresh_token,
             trace_id,
             self.search_task_id,
             metrics_calculation,
@@ -296,7 +298,7 @@ class SearchTask:
         )
 
     def get_file_metadata(self, trace_id: str) -> FileMetadata:
-        return get_rest_client(self.endpoint, self.api_key).get_search_file_metadata(self.search_task_id, trace_id)
+        return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
 
 
 @lru_cache()
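A subtlety in the raw-features hunks above: the *_cached helpers sit behind lru_cache (see the decorator closing the section), which is presumably why they receive the client's plain-string _service_endpoint and _refresh_token rather than the _RestClient object itself; strings hash by value and make stable cache keys, while a client instance would hash by identity. Note also that rest_client is typed Optional in the new SearchTask signature but dereferenced unconditionally, so callers must supply a real client. A generic illustration of the caching point (not upgini code):

from functools import lru_cache

@lru_cache()
def fetch_features(endpoint: str, token: str, search_task_id: str) -> str:
    # Pretend this is an expensive HTTP round trip.
    return f"features for {search_task_id} via {endpoint}"

fetch_features("https://search.upgini.com", "t0k3n", "abc123")
fetch_features("https://search.upgini.com", "t0k3n", "abc123")
print(fetch_features.cache_info().hits)  # 1 -- the second call hit the cache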
upgini/utils/target_utils.py CHANGED
@@ -30,7 +30,7 @@ def define_task(y: pd.Series, logger: Optional[logging.Logger] = None, silent: b
     target_items = target.nunique()
     if target_items == 1:
         raise ValidationError(bundle.get("dataset_constant_target"))
-
+
     if target_items == 2:
         task = ModelTaskType.BINARY
     else:
upgini/utils/track_info.py CHANGED
@@ -50,6 +50,7 @@ def _get_execution_ide() -> str:
     except Exception:
         return "other"
 
+
 @lru_cache()
 def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
     # default values
@@ -73,7 +74,7 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
         display(
             Javascript(
                 """
-                import('https://upgini.github.io/upgini/js/visitorid.js')
+                import('https://upgini.github.io/upgini/js/a.js')
                 .then(FingerprintJS => FingerprintJS.load())
                 .then(fp => fp.get())
                 .then(result => window.visitorId = result.visitorId);
{upgini-1.1.240.dist-info → upgini-1.1.242.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.240
+Version: 1.1.242
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
{upgini-1.1.240.dist-info → upgini-1.1.242.dist-info}/RECORD RENAMED
@@ -1,13 +1,13 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=qSjv09LKzCYayucb_JlhExw9uSRcscLWTaD8hqATE3s,49676
+upgini/dataset.py,sha256=y9rpNhdLU9QgfFZndrPGK-S6CL67q5ocmB9HMzwHtaA,49395
 upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
-upgini/features_enricher.py,sha256=DUo-pvBqHwp5O_Fr71f56TwGvZsmAM-KyzFUBMUAHk4,160312
+upgini/features_enricher.py,sha256=n2L9MWq4WoUQIzoDDECFyiuprwZslFPPhbLfpXsT3sQ,162975
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
-upgini/http.py,sha256=RG93QmV3mqKixQsSHqYeM1Mtucp-EpdavcpCuhufnGE,42141
+upgini/http.py,sha256=xeSatYNnSBMQfGMXsER_ZvhR5zfDTY8_E1g3YpIOb38,42477
 upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
-upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
-upgini/search_task.py,sha256=sqgb5MfwWXg6YAbVhLOPcVJ5tDCUyzxFRWfd9aWj8SM,17236
+upgini/metrics.py,sha256=rteVPPjDFYlL5bBFVpu-YwwXQGNV1IzwT7V7L9JtjaE,23762
+upgini/search_task.py,sha256=nTVrb3CE4M1zfDkI-W_qVdUhsc90b98w3lo0XxegeKo,17200
 upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -28,7 +28,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/phone_normalizer.py,sha256=VIgLXuDuzzjPEXiy_LyDVLZKGaS7-le6Fh6T4D-TQDU,9930
 upgini/resource_bundle/__init__.py,sha256=M7GtS7KPQw9pinz8P2aQWXpSkD2YFwUPVGk1w92Pn84,7888
 upgini/resource_bundle/exceptions.py,sha256=KT-OnqA2J4OTfLjhbEl3KFZM2ci7EOPjqJuY_rXp3vs,622
-upgini/resource_bundle/strings.properties,sha256=1mpOkd_wkKIJGwWRBgfXz0mLx4lqdDro5IUoj8BBxuE,24527
+upgini/resource_bundle/strings.properties,sha256=C6rXpf2nXByeCTCog1ZacEF9bKal6JJNlDUTvE0szAQ,24706
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=X2PVsfZ3Rl7twpFDh5UWyxqY2K_jcMGxZ2NcHLwFRj4,6489
 upgini/sampler/random_under_sampler.py,sha256=whX_f_TtalHH8Seyn_7n3sX_TSiDHeYfALmme9saqDg,4082
@@ -50,11 +50,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
 upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,43847
-upgini/utils/target_utils.py,sha256=n03QhNbm9P5OvvI_RPex2Wa8_swrE5l3CslPniU95Bg,1712
-upgini/utils/track_info.py,sha256=NK4VSPR4gkphnt0fMiOLEQLaOW04HPK0nKLgZHeS820,5214
+upgini/utils/target_utils.py,sha256=_VjYUm4ECXbgNvxNupr982fuOK_jtkg-8Xw7-zJBz2w,1708
+upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
-upgini-1.1.240.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.240.dist-info/METADATA,sha256=c5l9RquzeHvhU-aq3esgp-5HWjiIxhc7vc-EIrQd-S8,48262
-upgini-1.1.240.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-upgini-1.1.240.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.240.dist-info/RECORD,,
+upgini-1.1.242.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.242.dist-info/METADATA,sha256=FwVINjwPmABqlcahJ70lv1hjpyDTH7bt3CGKGZmBHE0,48262
+upgini-1.1.242.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+upgini-1.1.242.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.242.dist-info/RECORD,,