PyPI - upgini - Versions diffs - 1.1.316a5__tar.gz → 1.1.317__tar.gz - Mend

upgini 1.1.316a5.tar.gz → 1.1.317.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (65) hide show

{upgini-1.1.316a5 → upgini-1.1.317}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.316a5
+Version: 1.1.317
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -29,9 +29,9 @@ Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=3.3.2
-Requires-Dist: numpy<=1.26.4,>=1.19.0
+Requires-Dist: numpy>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: pydantic<3.0.0,>1.0.0
+Requires-Dist: pydantic<2.0.0,>=1.8.2
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
 Requires-Dist: python-dateutil>=2.8.0

{upgini-1.1.316a5 → upgini-1.1.317}/pyproject.toml RENAMED Viewed

@@ -39,9 +39,9 @@ dependencies = [
     "fastparquet>=0.8.1",
     "ipywidgets>=8.1.0",
     "lightgbm>=3.3.2",
-    "numpy>=1.19.0,<=1.26.4",
+    "numpy>=1.19.0",
     "pandas>=1.1.0,<3.0.0",
-    "pydantic>1.0.0,<3.0.0",
+    "pydantic>=1.8.2,<2.0.0",
     "pyjwt>=2.8.0",
     "python-dateutil>=2.8.0",
     "python-json-logger>=2.0.2",
@@ -79,15 +79,15 @@ python = "3.10"
 cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
 format = "black {args}"
 lint = "ruff check {args}"
-test_all = 'pytest -s -vv tests'
+test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
-#[[tool.hatch.envs.test.matrix]]
-#python = ["3.8"]
-#pandas = ["1.1.0"]
+[[tool.hatch.envs.test.matrix]]
+python = ["3.8"]
+pandas = ["1.1.0"]
-#[[tool.hatch.envs.test.matrix]]
-#python = ["3.8", "3.9", "3.10"]
-#pandas = ["1.2.0", "1.3.0", "1.4.0", "1.5.0", "2.0.0"]
+[[tool.hatch.envs.test.matrix]]
+python = ["3.8", "3.9", "3.10"]
+pandas = ["1.2.0", "1.3.0", "1.4.0", "1.5.0", "2.0.0"]
 [[tool.hatch.envs.test.matrix]]
 python = ["3.9", "3.10"]
@@ -103,8 +103,7 @@ dependencies = [
 #  "pytest-timeout",
   "requests-mock",
   "pytest-datafiles",
-  "pytest-xdist",
-  "pandas~={matrix:pandas}",
+  "pandas~={matrix:pandas}.0",
 ]
 [tool.black]
@@ -116,5 +115,4 @@ profile = "black"
 [tool.pytest.ini_options]
 pythonpath = [
   "./src"
-]
-addopts="-n 4"
+]

upgini-1.1.317/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.1.317"

{upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/binary.py RENAMED Viewed

@@ -9,32 +9,32 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Min(PandasOperand):
-    name: str = "min"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "min"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return np.minimum(left, right)
 class Max(PandasOperand):
-    name: str = "max"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "max"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return np.maximum(left, right)
 class Add(PandasOperand, VectorizableMixin):
-    name: str = "+"
-    alias: str = "add"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "+"
+    alias = "add"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left + right
@@ -48,12 +48,12 @@ class Add(PandasOperand, VectorizableMixin):
 class Subtract(PandasOperand, VectorizableMixin):
-    name: str = "-"
-    alias: str = "sub"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "-"
+    alias = "sub"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left - right
@@ -67,12 +67,12 @@ class Subtract(PandasOperand, VectorizableMixin):
 class Multiply(PandasOperand, VectorizableMixin):
-    name: str = "*"
-    alias: str = "mul"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
+    name = "*"
+    alias = "mul"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    is_vectorizable = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left * right
@@ -86,12 +86,12 @@ class Multiply(PandasOperand, VectorizableMixin):
 class Divide(PandasOperand, VectorizableMixin):
-    name: str = "/"
-    alias: str = "div"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
-    is_vectorizable: bool = True
-    output_type: Optional[str] = "float"
+    name = "/"
+    alias = "div"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    output_type = "float"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return left / right.replace(0, np.nan)
@@ -105,10 +105,10 @@ class Divide(PandasOperand, VectorizableMixin):
 class Combine(PandasOperand):
-    name: str = "Combine"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
-    output_type: Optional[str] = "object"
+    name = "Combine"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "object"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.astype(str) + "_" + right.astype(str)
@@ -117,13 +117,13 @@ class Combine(PandasOperand):
 class CombineThenFreq(PandasOperand):
-    name: str = "CombineThenFreq"
-    is_binary: bool = True
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "CombineThenFreq"
+    is_binary = True
+    is_symmetrical = True
+    has_symmetry_importance = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.astype(str) + "_" + right.astype(str)
@@ -133,15 +133,15 @@ class CombineThenFreq(PandasOperand):
 class Distance(PandasOperand):
-    name: str = "dist"
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "dist"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return pd.Series(
-            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
+            1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
         )
     # row-wise dot product
@@ -152,14 +152,17 @@ class Distance(PandasOperand):
         res = res.reindex(left.index.union(right.index))
         return res
+    def __norm(self, vector: pd.Series) -> pd.Series:
+        return np.sqrt(self.__dot(vector, vector))
 # Left for backward compatibility
 class Sim(Distance):
-    name: str = "sim"
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return 1 - super().calculate_binary(left, right)
@@ -188,12 +191,12 @@ class StringSim(PandasOperand, abc.ABC):
 class JaroWinklerSim1(StringSim):
-    name: str = "sim_jw1"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_jw1"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value
@@ -203,12 +206,12 @@ class JaroWinklerSim1(StringSim):
 class JaroWinklerSim2(StringSim):
-    name: str = "sim_jw2"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_jw2"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value[::-1] if value is not None else None
@@ -218,12 +221,12 @@ class JaroWinklerSim2(StringSim):
 class LevenshteinSim(StringSim):
-    name: str = "sim_lv"
-    is_binary: bool = True
-    input_type: Optional[str] = "string"
-    output_type: Optional[str] = "float"
-    is_symmetrical: bool = True
-    has_symmetry_importance: bool = True
+    name = "sim_lv"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
         return value

{upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/date.py RENAMED Viewed

@@ -1,19 +1,15 @@
 import abc
+import json
 from typing import Any, Dict, List, Optional, Union
 import numpy as np
 import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
-from pydantic import BaseModel, __version__ as pydantic_version
+from pydantic import BaseModel, validator
 from upgini.autofe.operand import PandasOperand
-def get_pydantic_version():
-    major_version = int(pydantic_version.split('.')[0])
-    return major_version
 class DateDiffMixin(BaseModel):
     diff_unit: str = "D"
     left_unit: Optional[str] = None
@@ -43,10 +39,10 @@ class DateDiffMixin(BaseModel):
 class DateDiff(PandasOperand, DateDiffMixin):
-    name: str = "date_diff"
-    alias: Optional[str] = "date_diff_type1"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    name = "date_diff"
+    alias = "date_diff_type1"
+    is_binary = True
+    has_symmetry_importance = True
     replace_negative: bool = False
@@ -75,9 +71,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
 class DateDiffType2(PandasOperand, DateDiffMixin):
-    name: str = "date_diff_type2"
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    name = "date_diff_type2"
+    is_binary = True
+    has_symmetry_importance = True
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
@@ -109,8 +105,8 @@ _count_aggregations = ["nunique", "count"]
 class DateListDiff(PandasOperand, DateDiffMixin):
-    is_binary: bool = True
-    has_symmetry_importance: bool = True
+    is_binary = True
+    has_symmetry_importance = True
     aggregation: str
     replace_negative: bool = False
@@ -170,8 +166,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
 class DateListDiffBounded(DateListDiff):
-    lower_bound: Optional[int] = None
-    upper_bound: Optional[int] = None
+    lower_bound: Optional[int]
+    upper_bound: Optional[int]
     def __init__(self, **data: Any) -> None:
         if "name" not in data:
@@ -196,8 +192,8 @@ class DateListDiffBounded(DateListDiff):
 class DatePercentileBase(PandasOperand, abc.ABC):
-    is_binary: bool = True
-    output_type: Optional[str] = "float"
+    is_binary = True
+    output_type = "float"
     date_unit: Optional[str] = None
@@ -231,12 +227,12 @@ class DatePercentileBase(PandasOperand, abc.ABC):
 class DatePercentile(DatePercentileBase):
-    name: str = "date_per"
-    alias: Optional[str] = "date_per_method1"
+    name = "date_per"
+    alias = "date_per_method1"
-    zero_month: Optional[int] = None
-    zero_year: Optional[int] = None
-    zero_bounds: Optional[List[float]] = None
+    zero_month: Optional[int]
+    zero_year: Optional[int]
+    zero_bounds: Optional[List[float]]
     step: int = 30
     def get_params(self) -> Dict[str, Optional[str]]:
@@ -251,25 +247,12 @@ class DatePercentile(DatePercentileBase):
         )
         return res
-    # Check Pydantic version
-    if get_pydantic_version() >= 2:
-        # Use @field_validator for Pydantic 2.x
-        from pydantic import field_validator
-        @field_validator('zero_bounds', mode='before')
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return value[1:-1].split(", ")
-            return value
-    else:
-        # Use @validator for Pydantic 1.x
-        from pydantic import validator
-        @validator('zero_bounds', pre=True)
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return value[1:-1].split(", ")
+    @validator("zero_bounds", pre=True)
+    def validate_bounds(cls, value):
+        if value is None or isinstance(value, list):
             return value
+        elif isinstance(value, str):
+            return json.loads(value)
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
@@ -282,7 +265,7 @@ class DatePercentile(DatePercentileBase):
 class DatePercentileMethod2(DatePercentileBase):
-    name: str = "date_per_method2"
+    name = "date_per_method2"
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         pass

{upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/groupby.py RENAMED Viewed

@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class GroupByThenAgg(PandasOperand, VectorizableMixin):
     agg: Optional[str]
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    is_distribution_dependent: bool = True
+    is_vectorizable = True
+    is_grouping = True
+    is_distribution_dependent = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
 class GroupByThenMedian(GroupByThenAgg):
-    name: str = "GroupByThenMedian"
-    pandas_agg: str = "median"
-    is_distribution_dependent: bool = True
+    name = "GroupByThenMedian"
+    pandas_agg = "median"
+    is_distribution_dependent = True
 class GroupByThenRank(PandasOperand, VectorizableMixin):
-    name: str = "GroupByThenRank"
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
+    name = "GroupByThenRank"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
 class GroupByThenNUnique(PandasOperand, VectorizableMixin):
-    name: str = "GroupByThenNUnique"
-    is_vectorizable: bool = True
-    is_grouping: bool = True
-    output_type: Optional[str] = "int"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "GroupByThenNUnique"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "int"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
 class GroupByThenFreq(PandasOperand):
-    name: str = "GroupByThenFreq"
-    is_grouping: bool = True
-    output_type: Optional[str] = "float"
-    is_distribution_dependent: bool = True
-    input_type: Optional[str] = "discrete"
+    name = "GroupByThenFreq"
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         def _f(x):

{upgini-1.1.316a5 → upgini-1.1.317}/src/upgini/autofe/operand.py RENAMED Viewed

@@ -8,19 +8,19 @@ from pydantic import BaseModel
 class Operand(BaseModel):
     name: str
-    alias: Optional[str] = None
+    alias: Optional[str]
     is_unary: bool = False
     is_symmetrical: bool = False
     has_symmetry_importance: bool = False
-    input_type: Optional[str] = None
-    output_type: Optional[str] = None
+    input_type: Optional[str]
+    output_type: Optional[str]
     is_categorical: bool = False
     is_vectorizable: bool = False
     is_grouping: bool = False
     is_binary: bool = False
     is_vector: bool = False
     is_distribution_dependent: bool = False
-    params: Optional[Dict[str, str]] = None
+    params: Optional[Dict[str, str]]
     def set_params(self, params: Dict[str, str]):
         self.params = params

upgini 1.1.316a5__tar.gz → 1.1.317__tar.gz

Potentially problematic release.

upgini 1.1.316a5.tar.gz → 1.1.317.tar.gz