PyPI - upgini - Versions diffs - 1.1.297__tar.gz → 1.1.298__tar.gz - Mend

upgini 1.1.297.tar.gz → 1.1.298.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (65)

{upgini-1.1.297 → upgini-1.1.298}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.297
+Version: 1.1.298
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -131,7 +131,7 @@ Description-Content-Type: text/markdown
 |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
 |World economic indicators|191 |41|-|Monthly|date, country|No
 |Markets data|-|17|-|Monthly|date, datetime|No
-|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
+|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
 |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
 |World house prices |44|-|3|Annual|country, postal/ZIP code|No
 |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -840,4 +840,4 @@ Some convenient ways to start contributing are:
 - [More perks for registered users](https://profile.upgini.com)
 <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
-Please report it here</a></sup>
+Please report it here</a></sup>

{upgini-1.1.297 → upgini-1.1.298}/README.md RENAMED Viewed

@@ -90,7 +90,7 @@
 |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
 |World economic indicators|191 |41|-|Monthly|date, country|No
 |Markets data|-|17|-|Monthly|date, datetime|No
-|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
+|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
 |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
 |World house prices |44|-|3|Annual|country, postal/ZIP code|No
 |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
 - [More perks for registered users](https://profile.upgini.com)
 <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
-Please report it here</a></sup>
+Please report it here</a></sup>

upgini-1.1.298/src/upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.1.298"

{upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/all_operands.py RENAMED Viewed

@@ -1,7 +1,14 @@
 from typing import Dict
 from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
-from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
+from upgini.autofe.date import (
+    DateDiff,
+    DateDiffType2,
+    DateListDiff,
+    DateListDiffBounded,
+    DatePercentile,
+    DatePercentileMethod2,
+)
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
 from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
@@ -50,6 +57,7 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
         DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
         DatePercentile(),
+        DatePercentileMethod2(),
         Norm(),
     ]
 }

{upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/date.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import abc
 from typing import Any, Dict, List, Optional, Union
 import numpy as np
@@ -38,6 +39,7 @@ class DateDiffMixin(BaseModel):
 class DateDiff(PandasOperand, DateDiffMixin):
     name = "date_diff"
+    alias = "date_diff_type1"
     is_binary = True
     has_symmetry_importance = True
@@ -159,12 +161,45 @@ class DateListDiffBounded(DateListDiff):
         return super()._agg(x)
-class DatePercentile(PandasOperand):
-    name = "date_per"
+class DatePercentileBase(PandasOperand, abc.ABC):
     is_binary = True
     output_type = "float"
     date_unit: Optional[str] = None
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        # Assuming that left is a date column, right is a feature column
+        left = pd.to_datetime(left, unit=self.date_unit)
+        bounds = self._get_bounds(left)
+        return right.index.to_series().apply(lambda i: self._perc(right[i], bounds[i]))
+    @abc.abstractmethod
+    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
+        pass
+    def _perc(self, f, bounds):
+        hit = np.where(f >= bounds)[0]
+        if hit.size > 0:
+            return np.max(hit) + 1
+        else:
+            return np.nan
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+            }
+        )
+        return res
+class DatePercentile(DatePercentileBase):
+    name = "date_per"
+    alias = "date_per_method1"
     zero_month: Optional[int]
     zero_year: Optional[int]
     zero_bounds: Optional[List[float]]
@@ -174,7 +209,6 @@ class DatePercentile(PandasOperand):
         res = super().get_params()
         res.update(
             {
-                "date_unit": self.date_unit,
                 "zero_month": self.zero_month,
                 "zero_year": self.zero_year,
                 "zero_bounds": self.zero_bounds,
@@ -190,22 +224,18 @@ class DatePercentile(PandasOperand):
         elif isinstance(value, str):
             return value[1:-1].split(", ")
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        # Assuming that left is a date column, right is a feature column
-        left = pd.to_datetime(left, unit=self.date_unit)
-        months = left.dt.month
-        years = left.dt.year
+    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
+        months = date_col.dt.month
+        years = date_col.dt.year
         month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
-        bounds = month_diffs.apply(
+        return month_diffs.apply(
             lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
         )
-        return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
-    def __perc(self, f, bounds):
-        hit = np.where(f >= bounds)[0]
-        if hit.size > 0:
-            return np.max(hit) + 1
-        else:
-            return np.nan
+class DatePercentileMethod2(DatePercentileBase):
+    name = "date_per_method2"
+    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
+        pass

{upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/feature.py RENAMED Viewed

@@ -41,7 +41,7 @@ class Column:
     def get_column_nodes(self) -> List["Column"]:
         return [self]
-    def get_columns(self) -> List[str]:
+    def get_columns(self, **kwargs) -> List[str]:
         return [self.name]
     def infer_type(self, data: pd.DataFrame) -> DtypeObj:
@@ -57,6 +57,12 @@ class Column:
     def to_pretty_formula(self) -> str:
         return self.to_formula()
+    def __eq__(self, value: object) -> bool:
+        if not isinstance(value, Column):
+            return False
+        else:
+            return self.name == value.name and self.calculate_all == value.calculate_all
 class Feature:
     def __init__(
@@ -125,6 +131,9 @@ class Feature:
         for child in self.children:
             child.delete_data()
+    def get_op_display_name(self) -> str:
+        return self.op.alias or self.op.name.lower()
     def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
@@ -132,11 +141,11 @@ class Feature:
         if self.alias:
             components = ["f_autofe", self.alias]
         elif shorten and not self.op.is_unary:
-            components = ["f_autofe", self.op.alias or self.op.name.lower()]
+            components = ["f_autofe", self.get_op_display_name()]
         else:
             components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
                 "autofe",
-                self.op.alias or self.op.name.lower(),
+                self.get_op_display_name(),
             ]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -306,8 +315,21 @@ class FeatureGroup:
         main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
         if isinstance(self.op, PandasOperand):
             columns = self.get_columns()
-            new_data = self.op.calculate_group(data[columns], main_column=main_column)
-            new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
+            lower_order_children = [
+                ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
+            ]
+            lower_order_names = [ch.get_display_name() for ch in lower_order_children]
+            if any(isinstance(f, Feature) for f in lower_order_children):
+                child_data = pd.concat(
+                    [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
+                    axis=1,
+                )
+                child_data.columns = [main_column] + lower_order_names
+            else:
+                child_data = data[columns]
+            new_data = self.op.calculate_group(child_data, main_column=main_column)
+            new_data.rename(columns=dict(zip(lower_order_names, self.get_display_names())), inplace=True)
         else:
             raise NotImplementedError(f"Unrecognized operator {self.op.name}.")