PyPI - upgini - Versions diffs - 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl - Mend

upgini 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

upgini/__about__.py +1 -1
upgini/autofe/date.py +20 -4
upgini/autofe/feature.py +20 -10
upgini/autofe/unary.py +38 -1
upgini/autofe/vector.py +8 -21
upgini/metrics.py +4 -2
{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/METADATA +1 -1
{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/RECORD +10 -10
{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/WHEEL +0 -0
{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.73a3659.dev2"
1	+ __version__ = "1.2.75"

upgini/autofe/date.py CHANGED Viewed

@@ -187,16 +187,21 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
 class DateListDiffBounded(DateListDiff, ParametrizedOperator):
     lower_bound: Optional[int] = None
     upper_bound: Optional[int] = None
+    normalize: Optional[bool] = None
     def to_formula(self) -> str:
         lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
         upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
-        return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
+        norm = "_norm" if self.normalize else ""
+        return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}{norm}"
     @classmethod
     def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
         import re
+        normalize = formula.endswith("_norm")
+        formula = formula.replace("_norm", "")
         pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
         match = re.match(pattern, formula)
@@ -207,8 +212,13 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
         lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
         upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
         aggregation = match.group(6)
-        return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
+        return cls(
+            diff_unit=diff_unit,
+            lower_bound=lower_bound,
+            upper_bound=upper_bound,
+            aggregation=aggregation,
+            normalize=normalize,
+        )
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
@@ -216,14 +226,20 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
             res["lower_bound"] = str(self.lower_bound)
         if self.upper_bound is not None:
             res["upper_bound"] = str(self.upper_bound)
+        if self.normalize is not None:
+            res["normalize"] = str(self.normalize)
         return res
     def _agg(self, x):
+        orig_len = len(x)
         x = x[
             (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
             & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
         ]
-        return super()._agg(x)
+        agg_res = super()._agg(x)
+        if self.normalize and orig_len > 0:
+            return agg_res / orig_len
+        return agg_res
 class DatePercentileBase(PandasOperator, abc.ABC):

upgini/autofe/feature.py CHANGED Viewed

@@ -154,24 +154,34 @@ class Feature:
         for child in self.children:
             child.delete_data()
-    def get_op_display_name(self) -> str:
-        return (self.op.alias or self.op.to_formula()).lower()
+    def get_op_display_name(self, use_alias: bool = True) -> str:
+        return (self.op.alias or self.op.to_formula()).lower() if use_alias else self.op.to_formula()
-    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
+    def get_display_name(self, cache: bool = True, shorten: bool = False, use_op_alias: bool = True, **kwargs) -> str:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
         should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
-        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
+        components = []
         if self.alias:
-            components = ["f_autofe", self.alias]
-        elif shorten and (not self.op.is_unary or should_stack_op):
-            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
-        else:
-            components = (
-                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            components.extend(["f_autofe", self.alias])
+        elif should_stack_op:
+            components.extend(
+                [
+                    self.children[0].get_display_name(
+                        cache=cache, shorten=shorten, use_op_alias=use_op_alias, **kwargs
+                    ),
+                    self.get_op_display_name(use_alias=use_op_alias),
+                ]
             )
+        elif shorten and not self.op.is_unary:
+            components.extend(["f_autofe", self.get_op_display_name(use_alias=use_op_alias)])
+        else:
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
+                "autofe",
+                self.get_op_display_name(use_alias=use_op_alias),
+            ]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)

upgini/autofe/unary.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import Dict, Optional
+import json
+from typing import Dict, List, Optional
 import numpy as np
 import pandas as pd
 from upgini.autofe.operator import PandasOperator, VectorizableMixin
+from upgini.autofe.utils import pydantic_validator
 class Abs(PandasOperator, VectorizableMixin):
@@ -153,3 +155,38 @@ class Embeddings(PandasOperator):
     is_unary: bool = True
     input_type: Optional[str] = "string"
     output_type: Optional[str] = "vector"
+class Bin(PandasOperator):
+    name: str = "bin"
+    is_unary: bool = True
+    output_type: Optional[str] = "category"
+    bin_bounds: List[int] = []
+    is_categorical: bool = True
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
+    def _bin(self, f, bounds):
+        if f is None or np.isnan(f):
+            return np.nan
+        hit = np.where(f >= np.array(bounds))[0]
+        if hit.size > 0:
+            return np.max(hit) + 1
+        else:
+            return np.nan
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "bin_bounds": json.dumps(self.bin_bounds),
+            }
+        )
+        return res
+    @pydantic_validator("bin_bounds", mode="before")
+    def parse_bin_bounds(cls, value):
+        if isinstance(value, str):
+            return json.loads(value)
+        return value

upgini/autofe/vector.py CHANGED Viewed

@@ -1,8 +1,8 @@
-from typing import Dict, List, Optional
+from typing import List, Optional
 import pandas as pd
-from upgini.autofe.operator import OperatorRegistry, PandasOperator, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
 class Mean(PandasOperator, VectorizableMixin):
@@ -24,23 +24,10 @@ class Sum(PandasOperator, VectorizableMixin):
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
-class OnnxModel(PandasOperator, metaclass=OperatorRegistry):
-    name: str = "onnx"
+class Vectorize(PandasOperator, VectorizableMixin):
+    name: str = "vectorize"
     is_vector: bool = True
-    output_type: Optional[str] = "float"
-    model_name: str = ""
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "model_name": self.model_name,
-            }
-        )
-        return res
-    # def load_model(self):
-    #     ...
-    # def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-    #     ...
+    group_index: int = 0
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        return pd.DataFrame(data).T.apply(lambda x: x.to_list(), axis=1)

upgini/metrics.py CHANGED Viewed

@@ -326,7 +326,7 @@ class EstimatorWrapper:
         for c in x.columns:
             if is_numeric_dtype(x[c]):
                 x[c] = x[c].astype(float)
-            else:
+            elif not x[c].dtype == "category":
                 x[c] = x[c].astype(str)
         if not isinstance(y, pd.Series):
@@ -481,7 +481,7 @@ class EstimatorWrapper:
             "logger": logger,
         }
         if estimator is None:
-            params = {}
+            params = {"random_state": DEFAULT_RANDOM_STATE}
             if target_type == ModelTaskType.MULTICLASS:
                 params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
@@ -749,6 +749,8 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
             self.n_classes = len(np.unique(y_numpy))
         if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
+            if self.target_type == ModelTaskType.BINARY:
+                params["eval_metric"] = "auc"
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
         if self.cat_features:

{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.73a3659.dev2
+Version: 1.2.75
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-upgini/__about__.py,sha256=Vn3aojC64D6rn5ZFKIFRFVE3tY8D8CLC3Y0V5pbn2Jo,33
+upgini/__about__.py,sha256=xPczHfrMrTuUNz8xC9lgCjhkHVDmW9TFPuLq9_c_Ms8,23
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
@@ -6,7 +6,7 @@ upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
 upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
+upgini/metrics.py,sha256=pFRKBKyAri7xfe5pkNxcx241HQH95rV9afebgg8Tdiw,39156
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -15,13 +15,13 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
-upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
-upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
+upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
+upgini/autofe/feature.py,sha256=G_YgnsauIoaMgByx9JXDPiKc4nqs0pwWZUfvoIGMKxY,15305
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
-upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
-upgini/autofe/vector.py,sha256=w7ipoFRvR0BcTYcvJR9EbKc_ycIn9cJ94RLgrgIi4Uc,1212
+upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
 upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.73a3659.dev2.dist-info/METADATA,sha256=WImhNzA5wn2I_HyEYKvKAcUfpIWbQ0spUAI7tgu-fiQ,49101
-upgini-1.2.73a3659.dev2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.73a3659.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.73a3659.dev2.dist-info/RECORD,,
+upgini-1.2.75.dist-info/METADATA,sha256=jUP3dTEC71e0OcENot-gdjVx1gxqUPVPWufkY-vRv60,49091
+upgini-1.2.75.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.75.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.75.dist-info/RECORD,,

{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.73a3659.dev2.dist-info → upgini-1.2.75.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl

upgini 1.2.73a3659.dev2__py3-none-any.whl → 1.2.75__py3-none-any.whl