PyPI - upgini - Versions diffs - 1.1.103__py3-none-any.whl → 1.1.104__py3-none-any.whl - Mend

upgini 1.1.103__py3-none-any.whl → 1.1.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (9) hide show

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -44,6 +44,7 @@ class DataSourcePublisher:
         exclude_columns: Optional[List[str]] = None,
         hash_feature_names=False,
         snapshot_frequency_days: Optional[int] = None,
+        features_for_embeddings: Optional[List[str]] = None,
     ) -> str:
         trace_id = str(uuid.uuid4())
@@ -65,6 +66,7 @@ class DataSourcePublisher:
                     "excludeColumns": exclude_columns,
                     "hashFeatureNames": hash_feature_names,
                     "snapshotFrequencyDays": snapshot_frequency_days,
+                    "featuresForEmbeddings": features_for_embeddings,
                 }
                 self.logger.info(f"Start registering data table {request}")

upgini/features_enricher.py CHANGED Viewed

@@ -639,8 +639,8 @@ class FeaturesEnricher(TransformerMixin):
                 if (
                     self._search_task is None
                     or self._search_task.initial_max_hit_rate_v2() is None
-                    or self.X is None
-                    or self.y is None
+                    or (self.X is None and X is None)
+                    or (self.y is None and y is None)
                 ):
                     raise ValidationError(bundle.get("metrics_unfitted_enricher"))
@@ -653,6 +653,18 @@ class FeaturesEnricher(TransformerMixin):
                     self.__display_slack_community_link(msg)
                     return None
+                if (
+                    estimator is not None
+                    and hasattr(estimator, "get_param")
+                    and estimator.get_param("cat_features") is not None
+                ):
+                    cat_features = estimator.get_param("cat_features")
+                    if len(cat_features) > 0 and isinstance(cat_features[0], int):
+                        effectiveX = X or self.X
+                        cat_features = [effectiveX.columns[i] for i in cat_features]
+                else:
+                    cat_features = None
                 prepared_data = self._prepare_data_for_metrics(
                     trace_id, X, y, eval_set, exclude_features_sources, importance_threshold, max_features
                 )
@@ -699,7 +711,7 @@ class FeaturesEnricher(TransformerMixin):
                             f"Calculate baseline {metric} on client features: {fitting_X.columns.to_list()}"
                         )
                         baseline_estimator = EstimatorWrapper.create(
-                            estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
+                            estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
                         )
                         etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
@@ -711,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
                             f"Calculate enriched {metric} on combined features: {fitting_enriched_X.columns.to_list()}"
                         )
                         enriched_estimator = EstimatorWrapper.create(
-                            estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
+                            estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring, cat_features
                         )
                         enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
                         if etalon_metric is not None:

upgini/metrics.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union
 import numpy as np
 import pandas as pd
@@ -14,6 +14,7 @@ from sklearn.metrics._regression import (
     mean_squared_error,
 )
 from sklearn.model_selection import BaseCrossValidator, cross_validate
+from copy import deepcopy
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -40,7 +41,7 @@ LIGHTGBM_PARAMS = {
     "max_depth": 4,
     "n_estimators": 150,
     "learning_rate": 0.05,
-    "min_child_weight": 1
+    "min_child_weight": 1,
 }
 N_FOLDS = 5
@@ -129,6 +130,7 @@ class EstimatorWrapper:
         cv: BaseCrossValidator,
         X: pd.DataFrame,
         scoring: Union[Callable, str, None] = None,
+        cat_features: Optional[List[str]] = None,
     ) -> "EstimatorWrapper":
         scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
         kwargs = {
@@ -149,8 +151,16 @@ class EstimatorWrapper:
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
-            kwargs["estimator"] = estimator
+            if hasattr(estimator, "copy"):
+                estimator_copy = estimator.copy()
+            else:
+                estimator_copy = deepcopy(estimator)
+            kwargs["estimator"] = estimator_copy
             if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
+                if cat_features is not None:
+                    estimator_copy.set_params(cat_features=[
+                        X.columns.get_loc(cat_feature) for cat_feature in cat_features
+                    ])
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 try:
@@ -197,6 +207,15 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 X = X.drop(columns=name)
         cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
+        if (
+            hasattr(self.estimator, "get_param")
+            and hasattr(self.estimator, "_init_params")
+            and self.estimator.get_param("cat_features") is not None
+        ):
+            cat_features_set = set(cat_features_idx)
+            cat_features_set.update(self.estimator.get_param("cat_features"))
+            cat_features_idx = list(cat_features_set)
+            del self.estimator._init_params["cat_features"]
         params.update({"cat_features": cat_features_idx})
         return X, y, params
@@ -376,6 +395,6 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
 def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
     many_values_features_count = 0
     for f in _get_cat_features(X):
-        if X[f].nunique() > 100:
+        if X[f].astype("string").nunique() > 100:
             many_values_features_count += 1
     return many_values_features_count >= 2

upgini/utils/email_utils.py CHANGED Viewed

@@ -9,8 +9,7 @@ from pandas.api.types import is_string_dtype
 from upgini.metadata import SearchKey
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
-EMAIL_REGEX = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
+EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
 class EmailSearchKeyDetector(BaseSearchKeyDetector):
@@ -20,13 +19,11 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
         if not is_string_dtype(column):
             return False
+        if not column.astype("string").str.contains("@").any():
+            return False
         all_count = len(column)
-        is_email_count = len(
-            column.loc[
-                column.astype("string").str.fullmatch(EMAIL_REGEX)
-            ]
-        )
+        is_email_count = len(column.loc[column.astype("string").str.fullmatch(EMAIL_REGEX)])
         return is_email_count / all_count > 0.1

{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.103
+Version: 1.1.104
 Summary: Low-code feature search and enrichment library for machine learning
 Home-page: https://upgini.com/
 Author: Upgini Developers

{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/RECORD RENAMED Viewed

@@ -2,17 +2,17 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
 upgini/dataset.py,sha256=fl01WTbM2smgskjrHQJS9oTzymEj5ZulGngCU_d5PnQ,42110
 upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
-upgini/features_enricher.py,sha256=bjPiQDGn2ULS1dEFq7vIFxF49QM8UjJWFvTewqjDdHw,109147
+upgini/features_enricher.py,sha256=PtZIiOtpLItYRm3U7e5gsWAwAiTze4rznuKFFHjFpuQ,109768
 upgini/http.py,sha256=kgWj6wU1PbGPoGAbRvK35umXQ5zwEfEKeGy5Az0fss0,35479
 upgini/metadata.py,sha256=GPGsaGi5UtePQR2Qiqc7OJZn-ewvHmvepn3P_wJDW7Y,5856
-upgini/metrics.py,sha256=3gKDUJe4IzcS32hLitbPj2G-y-F66eyzKw0DWSIkun0,13937
+upgini/metrics.py,sha256=uJhtGKgUUFnvdF16xscfe9AGDoDN6LqUV97RWDP39NU,14869
 upgini/search_task.py,sha256=H7l-BhCRF9t58D0L1xNdC_qU_JFHYnAZZ165fVDQgmM,33884
 upgini/spinner.py,sha256=X9a0xhj0QVIwjVTTjXUTuAgPBnyrLbW-B6G534fxs1E,1149
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=Cc3v4lLLpM0g4oUH_q2DYFN3bNWpSmltAGnZQby3G74,2630
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=siGi3jWuP1wu4QS5g5XwNYGhCF0ILOw14qRXCfxD2jo,8415
+upgini/data_source/data_source_publisher.py,sha256=-Tpqiw6xrCinxdDKIEg6aS68ZqLwxoBrg4J4PTQNs6g,8546
 upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
 upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,7 +31,7 @@ upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o
 upgini/utils/cv_utils.py,sha256=PeexQVPWrpUNlmwGtfU1FWA-aI1UyrMDgMT594ErpxA,2252
 upgini/utils/datetime_utils.py,sha256=PK1Fc5rJ_UhCJc1TNOZPSrtsYxjD7v9dsBYOZj1RKvo,4292
 upgini/utils/display_utils.py,sha256=iG3-hdv8_rJDWKwnQYIi1SHF-gLPAEi8jjk_05-qtMg,1934
-upgini/utils/email_utils.py,sha256=H05wMKVZML36Ipwxv1C4StDUyPJVaXWgU5NA06UDJMo,3048
+upgini/utils/email_utils.py,sha256=2IUxP1e8DsmU4qS1BN3n1JmuziZO_cV35fNf4Di0yxc,3090
 upgini/utils/features_validator.py,sha256=LZAKTWtmINWII09UHF0R0muEz7yHLGlJkLUk8zM305Q,2190
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
@@ -39,8 +39,8 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
 upgini/utils/target_utils.py,sha256=3eHrDy_Dc9ozuOwHGnGA705m9glCxKmjB-DfLrflqiA,1370
 upgini/utils/track_info.py,sha256=O_oL4gy1jH0DVgtiUeZAW0YKCeRT4B_bzH_SZYkFaOE,4076
 upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
-upgini-1.1.103.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.103.dist-info/METADATA,sha256=8zAaEAC2-WpBOPqrRagZYdO1mABJzDTWhVgs7R86E1Q,41101
-upgini-1.1.103.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-upgini-1.1.103.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.103.dist-info/RECORD,,
+upgini-1.1.104.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.104.dist-info/METADATA,sha256=nvFzylBFXaBafhSec5_Ja5KfVcrNc2pzAHNRVzBMzhA,41101
+upgini-1.1.104.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+upgini-1.1.104.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.104.dist-info/RECORD,,

{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/LICENSE RENAMED Viewed

File without changes

{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.1.103.dist-info → upgini-1.1.104.dist-info}/top_level.txt RENAMED Viewed

File without changes

upgini 1.1.103__py3-none-any.whl → 1.1.104__py3-none-any.whl

Potentially problematic release.

upgini 1.1.103__py3-none-any.whl → 1.1.104__py3-none-any.whl