PyPI - upgini - Versions diffs - 1.1.315a1__py3-none-any.whl → 1.1.316__py3-none-any.whl - Mend

upgini 1.1.315a1__py3-none-any.whl → 1.1.316__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (11)

upgini/__about__.py +1 -1
upgini/autofe/binary.py +4 -1
upgini/autofe/unary.py +3 -0
upgini/data_source/data_source_publisher.py +9 -0
upgini/dataset.py +1 -1
upgini/features_enricher.py +42 -24
upgini/utils/datetime_utils.py +0 -1
{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/METADATA +1 -1
{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/RECORD +11 -11
{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/WHEEL +0 -0
{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.1.315a1"
1	+ __version__ = "1.1.316"

upgini/autofe/binary.py CHANGED Viewed

@@ -141,7 +141,7 @@ class Distance(PandasOperand):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return pd.Series(
-            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
+            1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
         )
     # row-wise dot product
@@ -152,6 +152,9 @@ class Distance(PandasOperand):
         res = res.reindex(left.index.union(right.index))
         return res
+    def __norm(self, vector: pd.Series) -> pd.Series:
+        return np.sqrt(self.__dot(vector, vector))
 # Left for backward compatibility
 class Sim(Distance):

upgini/autofe/unary.py CHANGED Viewed

@@ -121,6 +121,9 @@ class Norm(PandasOperand):
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         data_dropna = data.dropna()
+        if data_dropna.empty:
+            return data
         normalized_data = Normalizer().transform(data_dropna.to_frame().T).T
         normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
         normalized_data = normalized_data.reindex(data.index)

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -63,6 +63,7 @@ class DataSourcePublisher:
         keep_features: Optional[List[str]] = None,
         date_features: Optional[List[str]] = None,
         date_vector_features: Optional[List[str]] = None,
+        generate_runtime_embeddings: Optional[List[str]] = None,
         _force_generation=False,
         _silent=False,
     ) -> str:
@@ -163,6 +164,8 @@ class DataSourcePublisher:
                     if date_format is None:
                         raise ValidationError("date_format should be presented if you use date vector features")
                     request["dateVectorFeatures"] = date_vector_features
+                if generate_runtime_embeddings is not None:
+                    request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
                 self.logger.info(f"Start registering data table {request}")
                 task_id = self._rest_client.register_ads(request, trace_id)
@@ -276,6 +279,8 @@ class DataSourcePublisher:
         client_emails: Optional[List[str]] = None,
         date_features: Optional[List[str]] = None,
         date_vector_features: Optional[List[str]] = None,
+        exclude_from_autofe_generation: Optional[List[str]] = None,
+        generate_runtime_embeddings: Optional[List[str]] = None,
     ):
         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
@@ -327,6 +332,10 @@ class DataSourcePublisher:
                     request["dateFeatures"] = date_features
                 if date_vector_features is not None:
                     request["dateVectorFeatures"] = date_vector_features
+                if exclude_from_autofe_generation is not None:
+                    request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
+                if generate_runtime_embeddings is not None:
+                    request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
                 self.logger.info(f"Activating data tables with request {request}")
                 self._rest_client.activate_datatables(request, trace_id)

upgini/dataset.py CHANGED Viewed

@@ -692,7 +692,7 @@ class Dataset:  # (pd.DataFrame):
         parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
         self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
         uploading_file_size = Path(parquet_file_path).stat().st_size
-        self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
+        self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
             raise ValidationError(self.bundle.get("dataset_too_big_file"))
         return parquet_file_path

upgini/features_enricher.py CHANGED Viewed

@@ -846,17 +846,37 @@ class FeaturesEnricher(TransformerMixin):
                 self.logger.warning(msg)
                 print(msg)
+            if X is not None and y is None:
+                raise ValidationError("X passed without y")
             self.__validate_search_keys(self.search_keys, self.search_id)
             effective_X = X if X is not None else self.X
             effective_y = y if y is not None else self.y
             effective_eval_set = eval_set if eval_set is not None else self.eval_set
             effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
+            if (
+                self._search_task is None
+                or self._search_task.provider_metadata_v2 is None
+                or len(self._search_task.provider_metadata_v2) == 0
+                or effective_X is None
+                or effective_y is None
+            ):
+                raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
+            validated_X = self._validate_X(effective_X)
+            validated_y = self._validate_y(validated_X, effective_y)
+            validated_eval_set = (
+                [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
+                if effective_eval_set is not None
+                else None
+            )
             try:
                 self.__log_debug_information(
-                    effective_X,
-                    effective_y,
-                    effective_eval_set,
+                    validated_X,
+                    validated_y,
+                    validated_eval_set,
                     exclude_features_sources=exclude_features_sources,
                     cv=cv if cv is not None else self.cv,
                     importance_threshold=importance_threshold,
@@ -866,21 +886,9 @@ class FeaturesEnricher(TransformerMixin):
                     remove_outliers_calc_metrics=remove_outliers_calc_metrics,
                 )
-                if (
-                    self._search_task is None
-                    or self._search_task.provider_metadata_v2 is None
-                    or len(self._search_task.provider_metadata_v2) == 0
-                    or effective_X is None
-                    or effective_y is None
-                ):
-                    raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
-                if X is not None and y is None:
-                    raise ValidationError("X passed without y")
                 validate_scoring_argument(scoring)
-                self._validate_baseline_score(effective_X, effective_eval_set)
+                self._validate_baseline_score(validated_X, validated_eval_set)
                 if self._has_paid_features(exclude_features_sources):
                     msg = self.bundle.get("metrics_with_paid_features")
@@ -889,7 +897,7 @@ class FeaturesEnricher(TransformerMixin):
                     return None
                 cat_features, search_keys_for_metrics = self._get_client_cat_features(
-                    estimator, effective_X, self.search_keys
+                    estimator, validated_X, self.search_keys
                 )
                 prepared_data = self._prepare_data_for_metrics(
@@ -1034,10 +1042,10 @@ class FeaturesEnricher(TransformerMixin):
                         self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
                     }
                     if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
-                        effective_y
+                        validated_y
                     ):
                         train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                            np.mean(effective_y), 4
+                            np.mean(validated_y), 4
                         )
                     if etalon_metric is not None:
                         train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
@@ -1107,10 +1115,10 @@ class FeaturesEnricher(TransformerMixin):
                                 # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                             }
                             if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
-                                effective_eval_set[idx][1]
+                                validated_eval_set[idx][1]
                             ):
                                 eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                                    np.mean(effective_eval_set[idx][1]), 4
+                                    np.mean(validated_eval_set[idx][1]), 4
                                 )
                             if etalon_eval_metric is not None:
                                 eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
@@ -3158,6 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
         if len(search_key_names_by_type) == 0:
             return df, {}
+        self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
         multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
         other_columns = [col for col in df.columns if col not in multiple_keys_columns]
         exploded_dfs = []
@@ -3176,6 +3185,7 @@ class FeaturesEnricher(TransformerMixin):
             columns_renaming[new_search_key] = new_search_key
         df = pd.concat(exploded_dfs, ignore_index=True)
+        self.logger.info(f"Finished explosion. Size after: {len(df)}")
         return df, unnest_search_keys
     def __add_fit_system_record_id(
@@ -3209,18 +3219,26 @@ class FeaturesEnricher(TransformerMixin):
                 date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
             sort_columns = [date_column] if date_column is not None else []
+            sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
+            sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
             other_columns = sorted(
                 [
                     c
                     for c in df.columns
-                    if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
+                    if c not in sort_columns
+                    and c not in sorted_other_keys
+                    and c not in sort_exclude_columns
+                    and df[c].nunique() > 1
                 ]
             )
+            all_other_columns = sorted_other_keys + other_columns
             search_keys_hash = "search_keys_hash"
-            if len(other_columns) > 0:
+            if len(all_other_columns) > 0:
                 sort_columns.append(search_keys_hash)
-                df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
+                df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
             df = df.sort_values(by=sort_columns)

upgini/utils/datetime_utils.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import datetime
 import logging
 import re
-import pytz
 from typing import Dict, List, Optional
 import numpy as np

{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.315a1
+Version: 1.1.316
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=O5c86RzdiiVrkKCE9atzXzd_M73VBsx6my_6YBJx_co,26
+upgini/__about__.py,sha256=DQCLPSfZIiyKQ88S6JJcAEA3dURvJk2NhtYNJeB5Mq8,24
 upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=c6jghh32P9_2CspELYCOsmNIOiShuCADnCCJ8Jj2t50,30834
+upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=LqGOMObkFsAm58sBL3UhTmc7TOnDQmLivxl3jbXh-n0,187132
+upgini/features_enricher.py,sha256=Gu4gsnMVjcsfWnJlu4Np3jpE9Au1UywhuHQb0Xv5YNg,187982
 upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
 upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
 upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
@@ -15,15 +15,15 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
-upgini/autofe/binary.py,sha256=VyDCv6lw3LlKCsAS9ghwwh6_1OYBbejSzwGGH1Vc1tI,6908
+upgini/autofe/binary.py,sha256=2Z5FrfdCtesKEHBuabEBiRvwOAzcRoFKAX1wvGpHL0I,7003
 upgini/autofe/date.py,sha256=AO3P8GtUHD6vPE_1Vrj3nsnXYBxiXe7vun6aLHReZgQ,9064
 upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
 upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
 upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
-upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
+upgini/autofe/unary.py,sha256=oIMf-IVy7L7GkzxMmQyExX0tOH9RhWeQh7cGxxMDiPk,3832
 upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=kTewGmdoxTVkZEqDdbhWbmIKIvb7W0w7ml3WOo-qc2g,21450
+upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
 upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
-upgini/utils/datetime_utils.py,sha256=JSHCx6kpt7n60i3cphI5yWEatQK729x1coSjC8Gafrg,12135
+upgini/utils/datetime_utils.py,sha256=niZcf2YqAwokUFUW474zajlzv9HAMf7nv9v_WPJHpyc,12123
 upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
 upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
 upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
 upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.315a1.dist-info/METADATA,sha256=4Q_gEjMvnOKE9krAg4WFtPnWyF9LpTeMc6ZEPJuCF70,48224
-upgini-1.1.315a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.1.315a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.315a1.dist-info/RECORD,,
+upgini-1.1.316.dist-info/METADATA,sha256=12UKpdX0d9nky8XWhKtyQjDK2MVWtbsEr811NSWrKmE,48222
+upgini-1.1.316.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.1.316.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.316.dist-info/RECORD,,

{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.1.315a1__py3-none-any.whl → 1.1.316__py3-none-any.whl

Potentially problematic release.

upgini 1.1.315a1__py3-none-any.whl → 1.1.316__py3-none-any.whl