PyPI - upgini - Versions diffs - 1.1.316a4__tar.gz → 1.1.317__tar.gz - Mend

upgini 1.1.316a4 (tar.gz) → 1.1.317 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (65) hide show

{upgini-1.1.316a4 → upgini-1.1.317}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.316a4
+Version: 1.1.317
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=3.3.2
-Requires-Dist: numpy<=1.26.4,>=1.19.0
+Requires-Dist: numpy>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: pydantic<3.0.0,>1.0.0
+Requires-Dist: pydantic<2.0.0,>=1.8.2
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
 Requires-Dist: python-dateutil>=2.8.0

{upgini-1.1.316a4 → upgini-1.1.317}/pyproject.toml RENAMED Viewed

@@ -39,9 +39,9 @@ dependencies = [
     "fastparquet>=0.8.1",
     "ipywidgets>=8.1.0",
     "lightgbm>=3.3.2",
-    "numpy>=1.19.0,<=1.26.4",
+    "numpy>=1.19.0",
     "pandas>=1.1.0,<3.0.0",
-    "pydantic>1.0.0,<3.0.0",
+    "pydantic>=1.8.2,<2.0.0",
     "pyjwt>=2.8.0",
     "python-dateutil>=2.8.0",
     "python-json-logger>=2.0.2",
@@ -79,7 +79,7 @@ python = "3.10"
 cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
 format = "black {args}"
 lint = "ruff check {args}"
-test_all = 'pytest -s -vv tests'
+test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
 [[tool.hatch.envs.test.matrix]]
 python = ["3.8"]
@@ -103,7 +103,7 @@ dependencies = [
 #  "pytest-timeout",
   "requests-mock",
   "pytest-datafiles",
-  "pandas~={matrix:pandas}",
+  "pandas~={matrix:pandas}.0",
 ]
 [tool.black]
@@ -115,5 +115,4 @@ profile = "black"
 [tool.pytest.ini_options]
 pythonpath = [
   "./src"
-]
-addopts="-n 4"
+]

upgini-1.1.317/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.1.317"

{upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/binary.py RENAMED Viewed

@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Min(PandasOperand):
-    name: str = "min"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "min"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return np.minimum(left, right)
 class Max(PandasOperand):
-    name: str = "max"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "max"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return np.maximum(left, right)
 class Add(PandasOperand, VectorizableMixin):
-    name: str = "+"
-    alias: str = "add"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "+"
+    alias = "add"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
 class Subtract(PandasOperand, VectorizableMixin):
-    name: str = "-"
-    alias: str = "sub"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "-"
+    alias = "sub"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
 class Multiply(PandasOperand, VectorizableMixin):
-    name: str = "*"
-    alias: str = "mul"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "*"
+    alias = "mul"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
 class Divide(PandasOperand, VectorizableMixin):
-    name: str = "/"
-    alias: str = "div"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "float"
+    name = "/"
+    alias = "div"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    output_type = "float"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
 class Combine(PandasOperand):
-    name: str = "Combine"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
-    output_type: Optional[str] = "object"
+    name = "Combine"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "object"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
 class CombineThenFreq(PandasOperand):
-    name: str = "CombineThenFreq"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "CombineThenFreq"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.astype(str) + "_" + right.astype(str)
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
 class Distance(PandasOperand):
-    name: str = "dist"
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "dist"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return pd.Series(
-            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
+            1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
         )
     # row-wise dot product
@@ -152,14 +152,17 @@ class Distance(PandasOperand):
         res = res.reindex(left.index.union(right.index))
         return res
+    def __norm(self, vector: pd.Series) -> pd.Series:
+        return np.sqrt(self.__dot(vector, vector))
 # Left for backward compatibility
 class Sim(Distance):
-    name: str = "sim"
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return 1 - super().calculate_binary(left, right)
@@ -188,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
 class JaroWinklerSim1(StringSim):
-    name: str = "sim_jw1"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_jw1"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value
@@ -203,12 +206,12 @@ class JaroWinklerSim1(StringSim):
 class JaroWinklerSim2(StringSim):
-    name: str = "sim_jw2"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_jw2"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value[::-1] if value is not None else None
@@ -218,12 +221,12 @@ class JaroWinklerSim2(StringSim):
 class LevenshteinSim(StringSim):
-    name: str = "sim_lv"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_lv"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value

{upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/date.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import abc
+import json
 from typing import Any, Dict, List, Optional, Union
 import numpy as np
@@ -38,10 +39,10 @@ class DateDiffMixin(BaseModel):
 class DateDiff(PandasOperand, DateDiffMixin):
-    name: str = "date_diff"
-    alias: Optional[str] = "date_diff_type1"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    name = "date_diff"
+    alias = "date_diff_type1"
+    is_binary = True
+    has_symmetry_importance = True
     replace_negative: bool = False
@@ -70,9 +71,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
 class DateDiffType2(PandasOperand, DateDiffMixin):
-    name: str = "date_diff_type2"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    name = "date_diff_type2"
+    is_binary = True
+    has_symmetry_importance = True
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
@@ -104,8 +105,8 @@ _count_aggregations = ["nunique", "count"]
 class DateListDiff(PandasOperand, DateDiffMixin):
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    is_binary = True
+    has_symmetry_importance = True
     aggregation: str
     replace_negative: bool = False
@@ -165,8 +166,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
 class DateListDiffBounded(DateListDiff):
-    lower_bound: Optional[int] = None
-    upper_bound: Optional[int] = None
+    lower_bound: Optional[int]
+    upper_bound: Optional[int]
     def __init__(self, **data: Any) -> None:
         if "name" not in data:
@@ -191,8 +192,8 @@ class DateListDiffBounded(DateListDiff):
 class DatePercentileBase(PandasOperand, abc.ABC):
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
+    is_binary = True
+    output_type = "float"
     date_unit: Optional[str] = None
@@ -226,12 +227,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
 class DatePercentile(DatePercentileBase):
-    name: str = "date_per"
-    alias: Optional[str] = "date_per_method1"
+    name = "date_per"
+    alias = "date_per_method1"
-    zero_month: Optional[int] = None
-    zero_year: Optional[int] = None
-    zero_bounds: Optional[List[float]] = None
+    zero_month: Optional[int]
+    zero_year: Optional[int]
+    zero_bounds: Optional[List[float]]
     step: int = 30
     def get_params(self) -> Dict[str, Optional[str]]:
@@ -246,12 +247,12 @@ class DatePercentile(DatePercentileBase):
         )
         return res
-    @validator("zero_bounds", pre="true")
+    @validator("zero_bounds", pre=True)
     def validate_bounds(cls, value):
         if value is None or isinstance(value, list):
             return value
         elif isinstance(value, str):
-            return value[1:-1].split(", ")
+            return json.loads(value)
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
@@ -264,7 +265,7 @@ class DatePercentile(DatePercentileBase):
 class DatePercentileMethod2(DatePercentileBase):
-    name: str = "date_per_method2"
+    name = "date_per_method2"
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         pass

{upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/groupby.py RENAMED Viewed

@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class GroupByThenAgg(PandasOperand, VectorizableMixin):
     agg: Optional[str]
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    is_distribution_dependent: bool = True
+    is_vectorizable = True
+    is_grouping = True
+    is_distribution_dependent = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
 class GroupByThenMedian(GroupByThenAgg):
-    name: str = "GroupByThenMedian"
-    pandas_agg: str = "median"
-    is_distribution_dependent: bool = True
+    name = "GroupByThenMedian"
+    pandas_agg = "median"
+    is_distribution_dependent = True
 class GroupByThenRank(PandasOperand, VectorizableMixin):
-    name: str = "GroupByThenRank"
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
+    name = "GroupByThenRank"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
 class GroupByThenNUnique(PandasOperand, VectorizableMixin):
-    name: str = "GroupByThenNUnique"
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    output_type: Optional[str] = "int"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "GroupByThenNUnique"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "int"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
 class GroupByThenFreq(PandasOperand):
-    name: str = "GroupByThenFreq"
-    is_grouping: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "GroupByThenFreq"
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         def _f(x):

{upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/operand.py RENAMED Viewed

@@ -8,19 +8,19 @@ from pydantic import BaseModel
 class Operand(BaseModel):
     name: str
-    alias: Optional[str] = None
+    alias: Optional[str]
     is_unary: bool = False
     is_symmetrical: bool = False
     has_symmetry_importance: bool = False
-    input_type: Optional[str] = None
-    output_type: Optional[str] = None
+    input_type: Optional[str]
+    output_type: Optional[str]
     is_categorical: bool = False
     is_vectorizable: bool = False
     is_grouping: bool = False
     is_binary: bool = False
     is_vector: bool = False
     is_distribution_dependent: bool = False
-    params: Optional[Dict[str, str]] = None
+    params: Optional[Dict[str, str]]
     def set_params(self, params: Dict[str, str]):
         self.params = params

{upgini-1.1.316a4 → upgini-1.1.317}/src/upgini/autofe/unary.py RENAMED Viewed

@@ -1,4 +1,3 @@
-from typing import Optional
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import Normalizer
@@ -7,10 +6,10 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Abs(PandasOperand, VectorizableMixin):
-    name: str = "abs"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    group_index: int = 0
+    name = "abs"
+    is_unary = True
+    is_vectorizable = True
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return data.abs()
@@ -20,11 +19,11 @@ class Abs(PandasOperand, VectorizableMixin):
 class Log(PandasOperand, VectorizableMixin):
-    name: str = "log"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "float"
-    group_index: int = 0
+    name = "log"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -34,11 +33,11 @@ class Log(PandasOperand, VectorizableMixin):
 class Sqrt(PandasOperand, VectorizableMixin):
-    name: str = "sqrt"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "float"
-    group_index: int = 0
+    name = "sqrt"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(np.sqrt(np.abs(data)))
@@ -48,10 +47,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
 class Square(PandasOperand, VectorizableMixin):
-    name: str = "square"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    group_index: int = 0
+    name = "square"
+    is_unary = True
+    is_vectorizable = True
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return np.square(data)
@@ -61,11 +60,11 @@ class Square(PandasOperand, VectorizableMixin):
 class Sigmoid(PandasOperand, VectorizableMixin):
-    name: str = "sigmoid"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "float"
-    group_index: int = 0
+    name = "sigmoid"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(1 / (1 + np.exp(-data)))
@@ -75,12 +74,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
 class Floor(PandasOperand, VectorizableMixin):
-    name: str = "floor"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "int"
-    input_type: Optional[str] = "continuous"
-    group_index: int = 0
+    name = "floor"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "int"
+    input_type = "continuous"
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return np.floor(data)
@@ -90,11 +89,11 @@ class Floor(PandasOperand, VectorizableMixin):
 class Residual(PandasOperand, VectorizableMixin):
-    name: str = "residual"
-    is_unary: bool = True
-    is_vectorizable: bool = True
-    input_type: Optional[str] = "continuous"
-    group_index: int = 0
+    name = "residual"
+    is_unary = True
+    is_vectorizable = True
+    input_type = "continuous"
+    group_index = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return data - np.floor(data)
@@ -104,11 +103,11 @@ class Residual(PandasOperand, VectorizableMixin):
 class Freq(PandasOperand):
-    name: str = "freq"
-    is_unary: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "freq"
+    is_unary = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         value_counts = data.value_counts(normalize=True)
@@ -116,9 +115,9 @@ class Freq(PandasOperand):
 class Norm(PandasOperand):
-    name: str = "norm"
-    is_unary: bool = True
-    output_type: Optional[str] = "float"
+    name = "norm"
+    is_unary = True
+    output_type = "float"
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         data_dropna = data.dropna()
@@ -132,7 +131,7 @@ class Norm(PandasOperand):
 class Embeddings(PandasOperand):
-    name: str = "emb"
-    is_unary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "vector"
+    name = "emb"
+    is_unary = True
+    input_type = "string"
+    output_type = "vector"

upgini 1.1.316a4__tar.gz → 1.1.317__tar.gz

Potentially problematic release.

upgini 1.1.316a4 (tar.gz) → 1.1.317 (tar.gz)