upgini 1.2.57__py3-none-any.whl → 1.2.57a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of upgini might be problematic. See the release advisory for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.57"
1
+ __version__ = "1.2.57a2"
upgini/autofe/vector.py CHANGED
@@ -2,11 +2,7 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
-
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
5
+ from pydantic import validator
10
6
 
11
7
  from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
8
 
@@ -33,16 +29,12 @@ class Sum(PandasOperand, VectorizableMixin):
33
29
  class TimeSeriesBase(PandasOperand, abc.ABC):
34
30
  is_vector: bool = True
35
31
  date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
32
 
39
33
  def get_params(self) -> Dict[str, Optional[str]]:
40
34
  res = super().get_params()
41
35
  res.update(
42
36
  {
43
37
  "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
38
  }
47
39
  )
48
40
  return res
@@ -54,31 +46,13 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
54
46
  ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
47
  ts.set_index(date.name, inplace=True)
56
48
  ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
49
+ ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
67
50
  ts = self._aggregate(ts)
68
51
  ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
52
  ts.index = date.index
70
53
 
71
54
  return ts.iloc[:, -1]
72
55
 
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
56
  @abc.abstractmethod
83
57
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
58
  pass
@@ -93,7 +67,6 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
93
67
  window_unit: str = "D"
94
68
 
95
69
  @validator("window_unit")
96
- @classmethod
97
70
  def validate_window_unit(cls, v: str) -> str:
98
71
  try:
99
72
  pd.tseries.frequencies.to_offset(v)
@@ -104,35 +77,12 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
104
77
  )
105
78
 
106
79
  def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
80
+ return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
111
81
 
112
82
  @classmethod
113
83
  def from_formula(cls, formula: str) -> Optional["Roll"]:
114
84
  import re
115
85
 
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
86
  pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
87
  match = re.match(pattern, formula)
138
88
 
@@ -157,7 +107,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
157
107
  return res
158
108
 
159
109
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
110
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
161
111
  _roll_aggregations.get(self.aggregation, self.aggregation)
162
112
  )
163
113
 
@@ -167,33 +117,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
167
117
  lag_unit: str = "D"
168
118
 
169
119
  def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
120
+ return f"lag_{self.lag_size}{self.lag_unit}"
174
121
 
175
122
  @classmethod
176
123
  def from_formula(cls, formula: str) -> Optional["Lag"]:
177
124
  import re
178
125
 
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
126
  pattern = r"^lag_(\d+)([a-zA-Z])$"
198
127
  match = re.match(pattern, formula)
199
128
 
@@ -207,12 +136,6 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
207
136
 
208
137
  def get_params(self) -> Dict[str, Optional[str]]:
209
138
  res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
139
  return res
217
140
 
218
141
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
@@ -386,6 +386,7 @@ class DataSourcePublisher:
386
386
  search_keys = [k.value.value for k in search_keys] if search_keys else None
387
387
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
388
388
  task_id = self._rest_client.upload_online(request, trace_id)
389
+ print(f"Uploading online task created. task_id={task_id}")
389
390
  with Spinner():
390
391
  status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
391
392
  while status_response["status"] not in self.FINAL_STATUSES:
upgini/dataset.py CHANGED
@@ -587,15 +587,23 @@ class Dataset: # (pd.DataFrame):
587
587
  if (
588
588
  runtime_parameters is not None
589
589
  and runtime_parameters.properties is not None
590
- and "generate_features" in runtime_parameters.properties
591
590
  ):
592
- generate_features = runtime_parameters.properties["generate_features"].split(",")
593
- renamed_generate_features = []
594
- for f in generate_features:
595
- for new_column, orig_column in self.columns_renaming.items():
596
- if f == orig_column:
597
- renamed_generate_features.append(new_column)
598
- runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
591
+ if "generate_features" in runtime_parameters.properties:
592
+ generate_features = runtime_parameters.properties["generate_features"].split(",")
593
+ renamed_generate_features = []
594
+ for f in generate_features:
595
+ for new_column, orig_column in self.columns_renaming.items():
596
+ if f == orig_column:
597
+ renamed_generate_features.append(new_column)
598
+ runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
599
+ if "columns_for_online_api" in runtime_parameters.properties:
600
+ columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
601
+ renamed_columns_for_online_api = []
602
+ for f in columns_for_online_api:
603
+ for new_column, orig_column in self.columns_renaming.items():
604
+ if f == orig_column:
605
+ renamed_columns_for_online_api.append(new_column)
606
+ runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
599
607
 
600
608
  return runtime_parameters
601
609
 
@@ -222,6 +222,7 @@ class FeaturesEnricher(TransformerMixin):
222
222
  loss: Optional[str] = None,
223
223
  detect_missing_search_keys: bool = True,
224
224
  generate_features: Optional[List[str]] = None,
225
+ columns_for_online_api: Optional[List[str]] = None,
225
226
  round_embeddings: Optional[int] = None,
226
227
  logs_enabled: bool = True,
227
228
  raise_validation_error: bool = True,
@@ -345,6 +346,9 @@ class FeaturesEnricher(TransformerMixin):
345
346
  self.logger.error(msg)
346
347
  raise ValidationError(msg)
347
348
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
349
+ self.columns_for_online_api = columns_for_online_api
350
+ if columns_for_online_api is not None:
351
+ self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
348
352
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
349
353
  if maybe_downsampling_limit is not None:
350
354
  Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -2620,17 +2624,18 @@ if response.status_code == 200:
2620
2624
  checked_generate_features = []
2621
2625
  for gen_feature in self.generate_features:
2622
2626
  if gen_feature not in x_columns:
2623
- if gen_feature == self._get_phone_column(self.search_keys):
2624
- raise ValidationError(
2625
- self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2626
- )
2627
- else:
2628
- self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2627
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2628
+ self.__log_warning(msg)
2629
2629
  else:
2630
2630
  checked_generate_features.append(gen_feature)
2631
2631
  self.generate_features = checked_generate_features
2632
2632
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2633
2633
 
2634
+ if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
2635
+ for column in self.columns_for_online_api:
2636
+ if column not in validated_X.columns:
2637
+ raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
2638
+
2634
2639
  if self.id_columns is not None:
2635
2640
  for id_column in self.id_columns:
2636
2641
  if id_column not in validated_X.columns:
@@ -111,6 +111,7 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
114
115
  x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
116
  train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
117
  eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
116
116
  else:
117
117
  df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
118
118
 
119
- del self.search_keys[self.email_column]
120
- if self.email_column in self.unnest_search_keys:
121
- self.unnest_search_keys.remove(self.email_column)
119
+ # del self.search_keys[self.email_column]
120
+ # if self.email_column in self.unnest_search_keys:
121
+ # self.unnest_search_keys.remove(self.email_column)
122
122
 
123
123
  one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
124
  df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
125
  self.columns_renaming[one_domain_name] = original_email_column
126
126
  self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
127
127
 
128
- if self.email_converted_to_hem:
129
- df = df.drop(columns=self.email_column)
130
- del self.columns_renaming[self.email_column]
128
+ # if self.email_converted_to_hem:
129
+ # df = df.drop(columns=self.email_column)
130
+ # del self.columns_renaming[self.email_column]
131
131
 
132
132
  return df
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.57
3
+ Version: 1.2.57a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=4bvatwbfE15IIgVfHJZH8d-WXGATbSGcT6GSdTUc1l0,23
1
+ upgini/__about__.py,sha256=PD2lbh5FQufk15oyUAYIGJrdUHAs9qG5Btw3lTqrUtI,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
4
+ upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=FkAKQV_XOXTobwOXpdy9BPfRkL4fkgoNa2B6NniiCrs,201554
6
+ upgini/features_enricher.py,sha256=qJhzMy_Z16wUduRrtAluawV8h_t4HCg9I7uDpRnhKjk,201884
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -21,16 +21,16 @@ upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,1514
21
21
  upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
22
  upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
24
+ upgini/autofe/vector.py,sha256=bvcop9b0uFFPfQ3FLTwXT2IYfxNl4dIfR8icvnBHvOA,4358
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
26
+ upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
33
+ upgini/resource_bundle/strings.properties,sha256=UXMiaFP3p-WdiXyZJN3O_OZstb-F33BWVDxDiofyxd4,27464
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -46,7 +46,7 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
46
  upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
47
47
  upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
- upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
49
+ upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
51
  upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
52
52
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.57.dist-info/METADATA,sha256=oRGZz3JdygY9pgsN4tSc14GF7Iqhfp4lMXs2TBQX3Qw,49055
63
- upgini-1.2.57.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
- upgini-1.2.57.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.57.dist-info/RECORD,,
62
+ upgini-1.2.57a2.dist-info/METADATA,sha256=-dEVxWnjwc3LcSqFVJGENL07YJDvWgH8mHQ0PaE93sI,49057
63
+ upgini-1.2.57a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.57a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.57a2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any