PyPI - upgini - Versions diffs - 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl - Mend

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (49)

upgini/__about__.py +1 -0
upgini/ads.py +6 -2
upgini/ads_management/ads_manager.py +4 -2
upgini/autofe/all_operands.py +16 -4
upgini/autofe/binary.py +2 -1
upgini/autofe/date.py +74 -7
upgini/autofe/feature.py +1 -1
upgini/autofe/groupby.py +3 -1
upgini/autofe/operand.py +4 -3
upgini/autofe/unary.py +20 -1
upgini/autofe/vector.py +2 -0
upgini/data_source/data_source_publisher.py +14 -4
upgini/dataset.py +8 -7
upgini/errors.py +1 -1
upgini/features_enricher.py +156 -63
upgini/http.py +11 -10
upgini/mdc/__init__.py +1 -3
upgini/mdc/context.py +4 -6
upgini/metadata.py +3 -0
upgini/metrics.py +160 -96
upgini/normalizer/phone_normalizer.py +2 -2
upgini/resource_bundle/__init__.py +5 -5
upgini/resource_bundle/strings.properties +9 -4
upgini/sampler/base.py +1 -4
upgini/sampler/random_under_sampler.py +2 -5
upgini/search_task.py +4 -4
upgini/spinner.py +1 -1
upgini/utils/__init__.py +3 -2
upgini/utils/base_search_key_detector.py +2 -2
upgini/utils/blocked_time_series.py +4 -2
upgini/utils/country_utils.py +2 -2
upgini/utils/custom_loss_utils.py +3 -2
upgini/utils/cv_utils.py +2 -2
upgini/utils/datetime_utils.py +75 -18
upgini/utils/deduplicate_utils.py +61 -18
upgini/utils/email_utils.py +3 -3
upgini/utils/fallback_progress_bar.py +1 -1
upgini/utils/features_validator.py +2 -1
upgini/utils/progress_bar.py +1 -1
upgini/utils/sklearn_ext.py +15 -15
upgini/utils/target_utils.py +21 -7
upgini/utils/track_info.py +27 -15
upgini/version_validator.py +2 -2
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0

upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.1.280a3418-2"

upgini/ads.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Dict, Optional
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_string_dtype
+from pandas.api.types import is_object_dtype, is_string_dtype
 from upgini import SearchKey
 from upgini.http import get_rest_client
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
             if df[column_name].notnull().sum() < min_valid_rows_count:
                 raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
             meaning_type = search_keys[column_name].value
-            if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
+            if (
+                meaning_type == FileColumnMeaningType.MSISDN
+                and not is_string_dtype(df[column_name])
+                and not is_object_dtype(df[column_name])
+            ):
                 df[column_name] = df[column_name].values.astype(np.int64).astype("string")  # type: ignore
         else:
             meaning_type = FileColumnMeaningType.FEATURE

upgini/ads_management/ads_manager.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import time
-from typing import Dict, Optional
 import uuid
+from typing import Dict, Optional
+import pandas as pd
 from upgini.http import get_rest_client
 from upgini.spinner import Spinner
-import pandas as pd
 class AdsManager:

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from typing import Dict
-from upgini.autofe.date import DateDiff, DateDiffFuture
+from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
+from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
-from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
+from upgini.autofe.unary import Abs, Bin, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
 from upgini.autofe.vector import Mean, Sum
 ALL_OPERANDS: Dict[str, Operand] = {
@@ -37,7 +38,18 @@ ALL_OPERANDS: Dict[str, Operand] = {
         Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
         Sim(),
         DateDiff(),
-        DateDiffFuture(),
+        DateDiffType2(),
+        DateListDiff(aggregation="min"),
+        DateListDiff(aggregation="max"),
+        DateListDiff(aggregation="mean"),
+        DateListDiff(aggregation="nunique"),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
+        Bin(),
     ]
 }

upgini/autofe/binary.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
 import numpy as np
 import pandas as pd
 from numpy import dot
 from numpy.linalg import norm
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Min(PandasOperand):
     name = "min"

upgini/autofe/date.py CHANGED Viewed

@@ -1,11 +1,14 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
 import numpy as np
 import pandas as pd
+from pandas.core.arrays.timedeltas import TimedeltaArray
+from pydantic import BaseModel
 from upgini.autofe.operand import PandasOperand
-class DateDiffMixin:
+class DateDiffMixin(BaseModel):
     diff_unit: str = "D"
     left_unit: Optional[str] = None
     right_unit: Optional[str] = None
@@ -34,18 +37,82 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return x
-class DateDiffFuture(PandasOperand, DateDiffMixin):
-    name = "date_diff_future"
+class DateDiffType2(PandasOperand, DateDiffMixin):
+    name = "date_diff_type2"
     is_binary = True
     has_symmetry_importance = True
-    is_vectorizable = False
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
-        future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
+        future = right + (left.dt.year - right.dt.year).apply(
+            lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
+        )
+        future = pd.to_datetime(future)
         before = future[future < left]
-        future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
+        future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
         diff = (future - left) / np.timedelta64(1, self.diff_unit)
         return diff
+_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
+class DateListDiff(PandasOperand, DateDiffMixin):
+    is_binary = True
+    has_symmetry_importance = True
+    aggregation: str
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            data["name"] = f"date_diff_{data.get('aggregation')}"
+        super().__init__(**data)
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        left = self._convert_to_date(left, self.left_unit)
+        right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
+    def _diff(self, x: TimedeltaArray):
+        if self.diff_unit == "Y":
+            x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
+        elif self.diff_unit == "M":
+            raise Exception("Unsupported difference unit: Month")
+        else:
+            x = x / np.timedelta64(1, self.diff_unit)
+        return x[x > 0]
+    def _agg(self, x):
+        method = getattr(np, self.aggregation, None)
+        default = np.nan
+        if method is None and self.aggregation in _ext_aggregations:
+            method, default = _ext_aggregations[self.aggregation]
+        elif not callable(method):
+            raise ValueError(f"Unsupported aggregation: {self.aggregation}")
+        return method(x) if len(x) > 0 else default
+class DateListDiffBounded(DateListDiff):
+    lower_bound: Optional[int]
+    upper_bound: Optional[int]
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            lower_bound = data.get("lower_bound")
+            upper_bound = data.get("upper_bound")
+            components = [
+                "date_diff",
+                data.get("diff_unit"),
+                str(lower_bound if lower_bound is not None else "minusinf"),
+                str(upper_bound if upper_bound is not None else "plusinf"),
+            ]
+            components.append(data.get("aggregation"))
+            data["name"] = "_".join(components)
+        super().__init__(**data)
+    def _agg(self, x):
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        return super()._agg(x)

upgini/autofe/feature.py CHANGED Viewed

@@ -215,7 +215,7 @@ class Feature:
             return Column(string)
         def is_trivial_char(c: str) -> bool:
-            return not (c in "()+-*/,")
+            return c not in "()+-*/,"
         def find_prev(string: str) -> int:
             if string[-1] != ")":

upgini/autofe/groupby.py CHANGED Viewed

@@ -1,7 +1,9 @@
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
 from typing import Optional
 import pandas as pd
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class GroupByThenAgg(PandasOperand, VectorizableMixin):
     agg: Optional[str]

upgini/autofe/operand.py CHANGED Viewed

@@ -1,8 +1,9 @@
-from pydantic import BaseModel
-from typing import Dict, List, Optional, Tuple, Union
 import abc
-import pandas as pd
+from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
+import pandas as pd
+from pydantic import BaseModel
 class Operand(BaseModel):

upgini/autofe/unary.py CHANGED Viewed

@@ -1,7 +1,8 @@
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
 import numpy as np
 import pandas as pd
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Abs(PandasOperand, VectorizableMixin):
     name = "abs"
@@ -110,3 +111,21 @@ class Freq(PandasOperand):
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         value_counts = data.value_counts(normalize=True)
         return self._loc(data, value_counts)
+class Bin(PandasOperand):
+    name = "bin"
+    is_unary = True
+    output_type = "int"
+    input_type = "discrete"
+    zero_bound_low: int
+    zero_bound_high: int
+    step: int
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        res = pd.Series(np.zeros(data.shape), index=data.index, dtype="int")
+        res.update((data[data < self.zero_bound_low] - self.zero_bound_low) // self.step)
+        res.update((data[data >= self.zero_bound_high] - self.zero_bound_high) // self.step + 1)
+        return res

upgini/autofe/vector.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from typing import List
 import pandas as pd
 from upgini.autofe.operand import PandasOperand, VectorizableMixin

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -48,6 +48,7 @@ class DataSourcePublisher:
         data_table_uri: str,
         search_keys: Dict[str, SearchKey],
         update_frequency: str,
+        exclude_from_autofe_generation: Optional[List[str]],
         secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
         sort_column: Optional[str] = None,
         date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
         join_date_abs_limit_days: Optional[int] = None,
         features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
         data_table_id_to_replace: Optional[str] = None,
-        exclude_from_autofe_generation: Optional[List[str]] = None,
         _force_generation=False,
         _silent=False,
     ) -> str:
@@ -72,8 +72,8 @@ class DataSourcePublisher:
                     )
                 if search_keys is None or len(search_keys) == 0:
                     raise ValidationError("Empty search keys")
-                if SearchKey.DATE in search_keys.values() and date_format is None:
-                    raise ValidationError("date_format is required for DATE search key")
+                # if SearchKey.DATE in search_keys.values() and date_format is None:
+                #     raise ValidationError("date_format is required for DATE search key")
                 if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
                     raise ValidationError(
                         f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
                     or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
                 ) and sort_column is None:
                     raise ValidationError("Sort column is required for passed search keys")
+                if (
+                    set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
+                    and snapshot_frequency_days is None
+                    and join_date_abs_limit_days is None
+                ):
+                    raise ValidationError(
+                        "With MSISDN and DATE keys one of the snapshot_frequency_days or"
+                        " join_date_abs_limit_days parameters is required"
+                    )
                 request = {
                     "dataTableUri": data_table_uri,
                     "searchKeys": {k: v.value.value for k, v in search_keys.items()},
-                    "dateFormat": date_format,
                     "excludeColumns": exclude_columns,
                     "hashFeatureNames": str(hash_feature_names).lower(),
                     "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
                     "featuresForEmbeddings": features_for_embeddings,
                     "forceGeneration": str(_force_generation).lower(),
                 }
+                if date_format is not None:
+                    request["dateFormat"] = date_format
                 if secondary_search_keys is not None:
                     request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
                 if sort_column is not None:

upgini/dataset.py CHANGED Viewed

@@ -15,6 +15,7 @@ from pandas.api.types import (
     is_float_dtype,
     is_integer_dtype,
     is_numeric_dtype,
+    is_object_dtype,
     is_period_dtype,
     is_string_dtype,
 )
@@ -60,7 +61,7 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
     MIN_SAMPLE_THRESHOLD = 5_000
-    IMBALANCE_THESHOLD = 0.4
+    IMBALANCE_THESHOLD = 0.6
     BINARY_BOOTSTRAP_LOOPS = 5
     MULTICLASS_BOOTSTRAP_LOOPS = 2
     MIN_TARGET_CLASS_ROWS = 100
@@ -94,7 +95,7 @@ class Dataset:  # (pd.DataFrame):
                 data = pd.read_csv(path, **kwargs)
             else:
                 # try different separators: , ; \t ...
-                with open(path, mode="r") as csvfile:
+                with open(path) as csvfile:
                     sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
                 kwargs["sep"] = sep
                 data = pd.read_csv(path, **kwargs)
@@ -219,7 +220,7 @@ class Dataset:  # (pd.DataFrame):
         """Check that string values less than maximum characters for LLM"""
         # self.logger.info("Validate too long string values")
         for col in self.data.columns:
-            if is_string_dtype(self.data[col]):
+            if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
                 max_length: int = self.data[col].astype("str").str.len().max()
                 if max_length > self.MAX_STRING_FEATURE_LENGTH:
                     self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
@@ -250,7 +251,7 @@ class Dataset:  # (pd.DataFrame):
     @staticmethod
     def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
         try:
-            if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
+            if isinstance(ip, (IPv4Address, IPv6Address)):
                 return int(ip)
         except Exception:
             pass
@@ -258,7 +259,7 @@ class Dataset:  # (pd.DataFrame):
     @staticmethod
     def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
         try:
-            if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
+            if isinstance(ip, (IPv4Address, IPv6Address)):
                 return str(int(ip))
         except Exception:
             pass
@@ -350,7 +351,7 @@ class Dataset:  # (pd.DataFrame):
         if postal_code is not None and postal_code in self.data.columns:
             # self.logger.info("Normalize postal code")
-            if is_string_dtype(self.data[postal_code]):
+            if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
                 try:
                     self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
                 except Exception:
@@ -821,7 +822,7 @@ class Dataset:  # (pd.DataFrame):
             return DataType.INT
         elif is_float_dtype(pandas_data_type):
             return DataType.DECIMAL
-        elif is_string_dtype(pandas_data_type):
+        elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
             return DataType.STRING
         else:
             msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)

upgini/errors.py CHANGED Viewed

@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
     """Unauthorized error from REST API."""
     def __init__(self, message, status_code):
-        message = "Unauthorized, please check your authorization token ({})".format(message)
+        message = f"Unauthorized, please check your authorization token ({message})"
         super(UnauthorizedError, self).__init__(message, status_code)

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

Potentially problematic release.

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl