PyPI - upgini - Versions diffs - 1.1.285__tar.gz → 1.1.285a3418.post1__tar.gz - Mend

upgini 1.1.285__tar.gz → 1.1.285a3418.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (64) hide show

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.285
+Version: 1.1.285a3418.post1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.1.285a3418.post1/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.1.285a3418-1"

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/autofe/all_operands.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from typing import Dict
 from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
-from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
+from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
 from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
@@ -49,6 +49,7 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
         DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
         DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
+        DatePercentile(),
     ]
 }

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/autofe/date.py RENAMED Viewed

@@ -1,9 +1,9 @@
-from typing import Any, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 import numpy as np
 import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 from upgini.autofe.operand import PandasOperand
@@ -27,6 +27,17 @@ class DateDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "diff_unit": self.diff_unit,
+                "left_unit": self.left_unit,
+                "right_unit": self.right_unit,
+            }
+        )
+        return res
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
@@ -42,6 +53,17 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "diff_unit": self.diff_unit,
+                "left_unit": self.left_unit,
+                "right_unit": self.right_unit,
+            }
+        )
+        return res
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
@@ -64,6 +86,15 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     has_symmetry_importance = True
     aggregation: str
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
     def __init__(self, **data: Any) -> None:
         if "name" not in data:
             data["name"] = f"date_diff_{data.get('aggregation')}"
@@ -116,3 +147,55 @@ class DateListDiffBounded(DateListDiff):
     def _agg(self, x):
         x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
         return super()._agg(x)
+class DatePercentile(PandasOperand):
+    name = "date_per"
+    is_binary = True
+    output_type = "float"
+    date_unit: Optional[str] = None
+    zero_month: Optional[int]
+    zero_year: Optional[int]
+    zero_bounds: Optional[List[float]]
+    step: int = 30
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+                "zero_month": self.zero_month,
+                "zero_year": self.zero_year,
+                "zero_bounds": self.zero_bounds,
+                "step": self.step,
+            }
+        )
+        return res
+    @validator("zero_bounds", pre=True)
+    def validate_bounds(cls, value):
+        if value is None or isinstance(value, list):
+            return value
+        elif isinstance(value, str):
+            return value[1:-1].split(", ")
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        # Assuming that left is a date column, right is a feature column
+        left = pd.to_datetime(left, unit=self.date_unit)
+        months = left.dt.month
+        years = left.dt.year
+        month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
+        bounds = month_diffs.apply(
+            lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * 30
+        )
+        return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
+    def __perc(self, f, bounds):
+        hit = np.where(f >= bounds)[0]
+        if hit.size > 0:
+            return np.max(hit) + 1
+        else:
+            return np.nan

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/autofe/feature.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import hashlib
 import itertools
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 import numpy as np
 import pandas as pd
@@ -16,6 +16,12 @@ class Column:
         self.data = data
         self.calculate_all = calculate_all
+    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
+        return self.name
+    def set_op_params(self, params: Dict[str, str]) -> "Column":
+        return self
     def rename_columns(self, mapping: Dict[str, str]) -> "Column":
         self.name = self._unhash(mapping.get(self.name) or self.name)
         return self
@@ -69,19 +75,30 @@ class Feature:
         self.cached_display_name = cached_display_name
         self.alias = alias
-    def set_op_params(self, params: Dict[str, str]) -> "Feature":
+    def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
+        obj_dict = self.op.dict().copy()
+        obj_dict.update(params or {})
+        self.op = self.op.__class__.parse_obj(obj_dict)
         self.op.set_params(params)
+        for child in self.children:
+            child.set_op_params(params)
         return self
     def get_hash(self) -> str:
-        return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
-            :8
-        ]
+        return hashlib.sha256(
+            "_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
+        ).hexdigest()[:8]
     def set_alias(self, alias: str) -> "Feature":
         self.alias = alias
         return self
+    def get_all_operand_names(self) -> Set[str]:
+        return {self.op.name}.union(
+            {n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
+        )
     def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
         for child in self.children:
             child.rename_columns(mapping)

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/autofe/operand.py RENAMED Viewed

@@ -25,8 +25,10 @@ class Operand(BaseModel):
         self.params = params
         return self
-    def get_params(self) -> Dict[str, str]:
-        return self.params
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = {"alias": self.alias}
+        res.update(self.params or {})
+        return res
 MAIN_COLUMN = "main_column"

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/features_enricher.py RENAMED Viewed

@@ -423,7 +423,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.info("Start fit")
-            self.__validate_search_keys(self.search_keys)
+            self.__validate_search_keys(self.search_keys, self.search_id)
             # Validate client estimator params
             self._get_client_cat_features(estimator, X, self.search_keys)
@@ -557,7 +557,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.info("Start fit_transform")
-            self.__validate_search_keys(self.search_keys)
+            self.__validate_search_keys(self.search_keys, self.search_id)
             search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
             if progress_callback is not None:
@@ -972,8 +972,6 @@ class FeaturesEnricher(TransformerMixin):
                     # 2 Fit and predict with KFold estimator on enriched tds
                     # and calculate final metric (and uplift)
-                    enriched_metric = None
-                    uplift = None
                     enriched_estimator = None
                     if set(fitting_X.columns) != set(fitting_enriched_X.columns):
                         self.logger.info(
@@ -994,15 +992,18 @@ class FeaturesEnricher(TransformerMixin):
                             has_date=has_date,
                         )
                         enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
-                        if enriched_metric is None:
+                        if etalon_metric is None:
                             self.logger.warning(
                                 f"Enriched {metric} on train combined features is None (maybe all features was removed)"
                             )
                             enriched_estimator = None
+                            uplift = None
                         else:
                             self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
-                        if etalon_metric is not None and enriched_metric is not None:
                             uplift = (enriched_metric - etalon_metric) * multiplier
+                    else:
+                        enriched_metric = None
+                        uplift = None
                     train_metrics = {
                         self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1451,15 +1452,12 @@ class FeaturesEnricher(TransformerMixin):
             if len(decimal_columns_to_fix) > 0:
                 for col in decimal_columns_to_fix:
                     fitting_eval_X[col] = (
-                        fitting_eval_X[col]
-                        .astype("string").str
-                        .replace(",", ".", regex=False)
-                        .astype(np.float64)
+                        fitting_eval_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
                     )
                     fitting_enriched_eval_X[col] = (
                         fitting_enriched_eval_X[col]
-                        .astype("string").str
-                        .replace(",", ".", regex=False)
+                        .astype("string")
+                        .str.replace(",", ".", regex=False)
                         .astype(np.float64)
                     )
@@ -2149,7 +2147,7 @@ class FeaturesEnricher(TransformerMixin):
             ]
             return excluded_features[feature_name_header].values.tolist()
-    def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str] = None):
+    def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str]):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
             if search_id:
                 self.logger.debug(f"search_id {search_id} provided without search_keys")
@@ -3278,10 +3276,16 @@ class FeaturesEnricher(TransformerMixin):
             descriptions = []
             for m in autofe_meta:
-                autofe_feature = Feature.from_formula(m.formula)
                 orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
-                autofe_feature.rename_columns(orig_to_hashed)
-                autofe_feature.set_display_index(m.display_index)
+                autofe_feature = (
+                    Feature.from_formula(m.formula)
+                    .set_display_index(m.display_index)
+                    .set_alias(m.alias)
+                    .set_op_params(m.operator_params or {})
+                    .rename_columns(orig_to_hashed)
+                )
                 if autofe_feature.op.is_vector:
                     continue
@@ -3302,7 +3306,7 @@ class FeaturesEnricher(TransformerMixin):
                     description[f"Feature {feature_idx}"] = bc.hashed_name
                     feature_idx += 1
-                description["Function"] = autofe_feature.op.name
+                description["Function"] = ",".join(sorted(autofe_feature.get_all_operand_names()))
                 descriptions.append(description)

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/metadata.py RENAMED Viewed

@@ -256,9 +256,11 @@ class BaseColumnMetadata(BaseModel):
 class GeneratedFeatureMetadata(BaseModel):
-    formula: str  # on hashed names
+    alias: Optional[str]
+    formula: str
     display_index: str
     base_columns: List[BaseColumnMetadata]
+    operator_params: Optional[Dict[str, str]]
 class ProviderTaskMetadataV2(BaseModel):

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/utils/custom_loss_utils.py RENAMED Viewed

@@ -11,49 +11,46 @@ def get_runtime_params_custom_loss(
     runtime_parameters: RuntimeParameters,
     logger: Optional[logging.Logger] = None,
 ) -> RuntimeParameters:
-    if not loss:
-        return runtime_parameters
     if logger is None:
         logger = logging.getLogger()
-    selection_loss_reg = [
-        "regression",
-        "regression_l1",
-        "huber",
-        "poisson",
-        "quantile",
-        "mape",
-        "mean_absolute_percentage_error",
-        "gamma",
-        "tweedie",
-    ]
-    selection_loss_binary = ["binary"]
-    selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
-    use_custom_loss = (
-        True
-        if (
-            (model_task_type == ModelTaskType.REGRESSION)
-            and (loss in selection_loss_reg)
-            or (model_task_type == ModelTaskType.BINARY)
-            and (loss in selection_loss_binary)
-            or (model_task_type == ModelTaskType.MULTICLASS)
-            and (loss in selection_loss_multi_clf)
+    if loss is not None:
+        selection_loss_reg = [
+            "regression",
+            "regression_l1",
+            "huber",
+            "poisson",
+            "quantile",
+            "mape",
+            "mean_absolute_percentage_error",
+            "gamma",
+            "tweedie",
+        ]
+        selection_loss_binary = ["binary"]
+        selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
+        use_custom_loss = (
+            True
+            if (
+                (model_task_type == ModelTaskType.REGRESSION)
+                and (loss in selection_loss_reg)
+                or (model_task_type == ModelTaskType.BINARY)
+                and (loss in selection_loss_binary)
+                or (model_task_type == ModelTaskType.MULTICLASS)
+                and (loss in selection_loss_multi_clf)
+            )
+            else False
         )
-        else False
-    )
-    if use_custom_loss:
-        runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
-        runtime_parameters.properties["lightgbm_params_base.objective"] = loss
-        runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
-        msg = bundle.get("loss_selection_info").format(loss)
-        logger.info(msg)
-        print(msg)
-    else:
-        msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
-        logger.warning(msg)
-        print(msg)
+        if use_custom_loss:
+            runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
+            runtime_parameters.properties["lightgbm_params_base.objective"] = loss
+            runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
+            msg = bundle.get("loss_selection_info").format(loss)
+            logger.info(msg)
+            print(msg)
+        else:
+            msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
+            logger.warning(msg)
+            print(msg)
     return runtime_parameters

{upgini-1.1.285 → upgini-1.1.285a3418.post1}/src/upgini/version_validator.py RENAMED Viewed

@@ -35,7 +35,7 @@ def validate_version(logger: logging.Logger):
         try:
             current_version = parse(__version__)
             latest_version = get_version("upgini")
-            if current_version < latest_version:
+            if current_version < latest_version:  # type: ignore
                 msg = bundle.get("version_warning").format(current_version, latest_version)
                 logger.warning(msg)
                 print(msg)