PyPI - upgini - Versions diffs - 1.1.269__py3-none-any.whl → 1.1.273__py3-none-any.whl - Mend

upgini 1.1.269__py3-none-any.whl → 1.1.273__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (10) hide show

upgini/autofe/all_operands.py +11 -1
upgini/autofe/date.py +60 -3
upgini/data_source/data_source_publisher.py +13 -3
upgini/features_enricher.py +16 -17
upgini/resource_bundle/strings.properties +1 -1
{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/METADATA +1 -1
{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/RECORD +10 -10
{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/WHEEL +1 -1
{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/LICENSE +0 -0
{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/top_level.txt +0 -0

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from typing import Dict
-from upgini.autofe.date import DateDiff, DateDiffType2
+from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
 from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -38,6 +38,16 @@ ALL_OPERANDS: Dict[str, Operand] = {
         Sim(),
         DateDiff(),
         DateDiffType2(),
+        DateListDiff(aggregation="min"),
+        DateListDiff(aggregation="max"),
+        DateListDiff(aggregation="mean"),
+        DateListDiff(aggregation="nunique"),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
     ]
 }

upgini/autofe/date.py CHANGED Viewed

@@ -1,11 +1,12 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
 import numpy as np
 import pandas as pd
+from pydantic import BaseModel
 from upgini.autofe.operand import PandasOperand
-class DateDiffMixin:
+class DateDiffMixin(BaseModel):
     diff_unit: str = "D"
     left_unit: Optional[str] = None
     right_unit: Optional[str] = None
@@ -38,7 +39,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
     name = "date_diff_type2"
     is_binary = True
     has_symmetry_importance = True
-    is_vectorizable = False
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
@@ -51,3 +51,60 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
         diff = (future - left) / np.timedelta64(1, self.diff_unit)
         return diff
+_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
+class DateListDiff(PandasOperand, DateDiffMixin):
+    is_binary = True
+    has_symmetry_importance = True
+    aggregation: str
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            data["name"] = f"date_diff_{data.get('aggregation')}"
+        super().__init__(**data)
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        left = self._convert_to_date(left, self.left_unit)
+        right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
+    def _diff(self, x):
+        x = x / np.timedelta64(1, self.diff_unit)
+        return x[x > 0]
+    def _agg(self, x):
+        method = getattr(np, self.aggregation, None)
+        default = np.nan
+        if method is None and self.aggregation in _ext_aggregations:
+            method, default = _ext_aggregations[self.aggregation]
+        elif not callable(method):
+            raise ValueError(f"Unsupported aggregation: {self.aggregation}")
+        return method(x) if len(x) > 0 else default
+class DateListDiffBounded(DateListDiff):
+    lower_bound: Optional[int]
+    upper_bound: Optional[int]
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            lower_bound = data.get("lower_bound")
+            upper_bound = data.get("upper_bound")
+            components = [
+                "date_diff",
+                data.get("diff_unit"),
+                str(lower_bound if lower_bound is not None else "minusinf"),
+                str(upper_bound if upper_bound is not None else "plusinf"),
+            ]
+            components.append(data.get("aggregation"))
+            data["name"] = "_".join(components)
+        super().__init__(**data)
+    def _agg(self, x):
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        return super()._agg(x)

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -72,8 +72,8 @@ class DataSourcePublisher:
                     )
                 if search_keys is None or len(search_keys) == 0:
                     raise ValidationError("Empty search keys")
-                if SearchKey.DATE in search_keys.values() and date_format is None:
-                    raise ValidationError("date_format is required for DATE search key")
+                # if SearchKey.DATE in search_keys.values() and date_format is None:
+                #     raise ValidationError("date_format is required for DATE search key")
                 if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
                     raise ValidationError(
                         f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
                     or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
                 ) and sort_column is None:
                     raise ValidationError("Sort column is required for passed search keys")
+                if (
+                    set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
+                    and snapshot_frequency_days is None
+                    and join_date_abs_limit_days is None
+                ):
+                    raise ValidationError(
+                        "With MSISDN and DATE keys one of the snapshot_frequency_days or"
+                        " join_date_abs_limit_days parameters is required"
+                    )
                 request = {
                     "dataTableUri": data_table_uri,
                     "searchKeys": {k: v.value.value for k, v in search_keys.items()},
-                    "dateFormat": date_format,
                     "excludeColumns": exclude_columns,
                     "hashFeatureNames": str(hash_feature_names).lower(),
                     "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
                     "featuresForEmbeddings": features_for_embeddings,
                     "forceGeneration": str(_force_generation).lower(),
                 }
+                if date_format is not None:
+                    request["dateFormat"] = date_format
                 if secondary_search_keys is not None:
                     request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
                 if sort_column is not None:

upgini/features_enricher.py CHANGED Viewed

@@ -424,7 +424,7 @@ class FeaturesEnricher(TransformerMixin):
                 self.X = X
                 self.y = y
                 self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
-                self.dump_input(trace_id, X, y, eval_set)
+                self.dump_input(trace_id, X, y, self.eval_set)
                 self.__inner_fit(
                     trace_id,
                     X,
@@ -563,7 +563,7 @@ class FeaturesEnricher(TransformerMixin):
                 self.X = X
                 self.y = y
                 self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
-                self.dump_input(trace_id, X, y, eval_set)
+                self.dump_input(trace_id, X, y, self.eval_set)
                 if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
                     raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
@@ -823,12 +823,16 @@ class FeaturesEnricher(TransformerMixin):
                 print(msg)
             self.__validate_search_keys(self.search_keys, self.search_id)
+            effective_X = X if X is not None else self.X
+            effective_y = y if y is not None else self.y
+            effective_eval_set = eval_set if eval_set is not None else self.eval_set
+            effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
             try:
                 self.__log_debug_information(
-                    X if X is not None else self.X,
-                    y if y is not None else self.y,
-                    eval_set if eval_set is not None else self.eval_set,
+                    effective_X,
+                    effective_y,
+                    effective_eval_set,
                     exclude_features_sources=exclude_features_sources,
                     cv=cv if cv is not None else self.cv,
                     importance_threshold=importance_threshold,
@@ -842,17 +846,14 @@ class FeaturesEnricher(TransformerMixin):
                     self._search_task is None
                     or self._search_task.provider_metadata_v2 is None
                     or len(self._search_task.provider_metadata_v2) == 0
-                    or (self.X is None and X is None)
-                    or (self.y is None and y is None)
+                    or effective_X is None
+                    or effective_y is None
                 ):
                     raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
                 if X is not None and y is None:
                     raise ValidationError("X passed without y")
-                effective_X = X if X is not None else self.X
-                effective_eval_set = eval_set if eval_set is not None else self.eval_set
                 validate_scoring_argument(scoring)
                 self._validate_baseline_score(effective_X, effective_eval_set)
@@ -872,8 +873,7 @@ class FeaturesEnricher(TransformerMixin):
                 ):
                     cat_features = estimator.get_param("cat_features")
                     if len(cat_features) > 0 and isinstance(cat_features[0], int):
-                        effectiveX = X or self.X
-                        cat_features = [effectiveX.columns[i] for i in cat_features]
+                        cat_features = [effective_X.columns[i] for i in cat_features]
                         for cat_feature in cat_features:
                             if cat_feature in self.search_keys:
                                 if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
@@ -883,9 +883,9 @@ class FeaturesEnricher(TransformerMixin):
                 prepared_data = self._prepare_data_for_metrics(
                     trace_id=trace_id,
-                    X=X,
-                    y=y,
-                    eval_set=eval_set,
+                    X=effective_X,
+                    y=effective_y,
+                    eval_set=effective_eval_set,
                     exclude_features_sources=exclude_features_sources,
                     importance_threshold=importance_threshold,
                     max_features=max_features,
@@ -995,8 +995,6 @@ class FeaturesEnricher(TransformerMixin):
                         enriched_metric = None
                         uplift = None
-                    effective_X = X if X is not None else self.X
-                    effective_y = y if y is not None else self.y
                     train_metrics = {
                         self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
                             "quality_metrics_train_segment"
@@ -2823,6 +2821,7 @@ class FeaturesEnricher(TransformerMixin):
             maybe_date_col = self._get_date_column(self.search_keys)
             if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
+                # TODO cast date column to single dtype
                 min_date = X[maybe_date_col].min()
                 max_date = X[maybe_date_col].max()
                 self.logger.info(f"Dates interval is ({min_date}, {max_date})")

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -203,7 +203,7 @@ phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`.
 target_type_detected=\nDetected task type: {}\n
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
-too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
+too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.269
+Version: 1.1.273
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers

{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
 upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=hdI3dRDyg9rKMGK3IyRTMTDxESEbF1xmtH6dp8k3srw,174132
+upgini/features_enricher.py,sha256=LPYSCGq89WLaL5iQNikTyhICUs_APtqEvhn5XRENn1U,174105
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
 upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
@@ -13,23 +13,23 @@ upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=Nb7Fu4owDNy9gKbJN88c1DxODNtEEGAhiLT1-Eoc9yI,1587
+upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
-upgini/autofe/date.py,sha256=lrZ5xpQO2L0c2bPta3EMdd1v5czDH_WY08Ww1s50t4w,1824
+upgini/autofe/date.py,sha256=cc0GMAJR0QZOI_Qp2V5UDklaXLNS_79O1GhU6GlOYzg,3895
 upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
 upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
 upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
 upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=QASEDhJ9SxJKcWxoN2vUPxrM_HTlwKQOPa92L7EQneA,15962
+upgini/data_source/data_source_publisher.py,sha256=J2lrpPuysUHPeqTSfoybBtPRTBCFu7R5KzaakhjaRDc,16485
 upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
 upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
 upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=00KNv1A3rxXioktqB9o_V_zX0etC2LZO7NBIEsCoNNQ,26087
+upgini/resource_bundle/strings.properties,sha256=TM9OykiEXNpcgFN3DpqBGbQs4N9m4mzHBn-k6aazc30,26111
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,4
 upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.269.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.269.dist-info/METADATA,sha256=eabO8mMQA4qAV37lMnBhxe2gpllcmOWFI65Hhb7b5Ec,48156
-upgini-1.1.269.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-upgini-1.1.269.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.269.dist-info/RECORD,,
+upgini-1.1.273.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.273.dist-info/METADATA,sha256=Omoz12LfHouVHSu4OlfpbPbHZJ4ZXW5K1bTUo3jFswg,48156
+upgini-1.1.273.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+upgini-1.1.273.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.273.dist-info/RECORD,,

{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: bdist_wheel (0.42.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/LICENSE RENAMED Viewed

File without changes

{upgini-1.1.269.dist-info → upgini-1.1.273.dist-info}/top_level.txt RENAMED Viewed

File without changes

upgini 1.1.269__py3-none-any.whl → 1.1.273__py3-none-any.whl

Potentially problematic release.

upgini 1.1.269__py3-none-any.whl → 1.1.273__py3-none-any.whl