PyPI - upgini - Versions diffs - 1.1.309__tar.gz → 1.1.309a3511.dev1__tar.gz - Mend

upgini 1.1.309.tar.gz → 1.1.309a3511.dev1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (65) hide show

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.309
+Version: 1.1.309a3511.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
 Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
+Requires-Dist: jarowinkler>=2.0.0
+Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=3.3.2
 Requires-Dist: numpy>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/pyproject.toml RENAMED Viewed

@@ -49,6 +49,9 @@ dependencies = [
     "scikit-learn>=1.3.0",
     "python-bidi==0.4.2",
     "xhtml2pdf==0.2.11",
+    "jarowinkler>=2.0.0",
+    "levenshtein>=0.25.1",
+    "python-bidi==0.4.2",
 ]
 [project.urls]

upgini-1.1.309a3511.dev1/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.1.309a3511.dev1"

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/all_operands.py RENAMED Viewed

@@ -1,17 +1,38 @@
 from typing import Dict
-from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
+from upgini.autofe.binary import (
+    Add,
+    Combine,
+    CombineThenFreq,
+    Distance,
+    Divide,
+    JaroWinklerSim1,
+    JaroWinklerSim2,
+    LevenshteinSim,
+    Max,
+    Min,
+    Multiply,
+    Sim,
+    Subtract,
+)
 from upgini.autofe.date import (
+    (
     DateDiff,
     DateDiffType2,
     DateListDiff,
     DateListDiffBounded,
     DatePercentile,
+    DatePercentileMethod2,
+),
     DatePercentileMethod2,
 )
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
 from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
+from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
 from upgini.autofe.vector import Mean, Sum
 ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +60,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
         GroupByThenAgg(name="GroupByThenMedian", agg="median"),
         GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
         GroupByThenRank(),
-        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
-        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
-        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
-        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
+        Combine(),
+        CombineThenFreq(),
+        GroupByThenNUnique(),
+        GroupByThenFreq(),
         Sim(),
         DateDiff(),
         DateDiffType2(),
@@ -59,6 +80,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DatePercentile(),
         DatePercentileMethod2(),
         Norm(),
+        JaroWinklerSim1(),
+        JaroWinklerSim2(),
+        LevenshteinSim(),
+        Distance(),
+        Embeddings(),
     ]
 }

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/binary.py RENAMED Viewed

@@ -1,7 +1,11 @@
+import abc
+from typing import Optional
+import Levenshtein
 import numpy as np
 import pandas as pd
 from numpy import dot
 from numpy.linalg import norm
+from jarowinkler import jarowinkler_similarity
 from upgini.autofe.operand import PandasOperand, VectorizableMixin
@@ -130,7 +134,27 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Sim(PandasOperand):
+class Distance(PandasOperand):
+    name = "dist"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return pd.Series(
+            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
+        )
+    # row-wise dot product
+    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        res = (left.dropna() * right.dropna()).apply(np.sum)
+        res = res.reindex(left.index.union(right.index))
+        return res
+# Left for backward compatibility
+class Sim(Distance):
     name = "sim"
     is_binary = True
     output_type = "float"
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
     has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return dot(left, right) / (norm(left) * norm(right))
+        return 1 - super().calculate_binary(left, right)
+class StringSim(PandasOperand, abc.ABC):
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        sims = []
+        for i in left.index:
+            left_i = self._prepare_value(left.get(i))
+            right_i = self._prepare_value(right.get(i))
+            if left_i is not None and right_i is not None:
+                sims.append(self._similarity(left_i, right_i))
+            else:
+                sims.append(None)
+        return pd.Series(sims, index=left.index)
+    @abc.abstractmethod
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        pass
+    @abc.abstractmethod
+    def _similarity(self, left: str, right: str) -> float:
+        pass
+class JaroWinklerSim1(StringSim):
+    name = "sim_jw1"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value
+    def _similarity(self, left: str, right: str) -> float:
+        return jarowinkler_similarity(left, right)
+class JaroWinklerSim2(StringSim):
+    name = "sim_jw2"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value[::-1] if value is not None else None
+    def _similarity(self, left: str, right: str) -> float:
+        return jarowinkler_similarity(left, right)
+class LevenshteinSim(StringSim):
+    name = "sim_lv"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value
+    def _similarity(self, left: str, right: str) -> float:
+        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/date.py RENAMED Viewed

@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
+    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
                 "diff_unit": self.diff_unit,
                 "left_unit": self.left_unit,
                 "right_unit": self.right_unit,
+                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return self.__replace_negative(diff)
     def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
-        x[x < 0] = None
+        if self.replace_negative:
+            x[x < 0] = None
         return x
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 class DateListDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
     aggregation: str
+    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
                 "aggregation": self.aggregation,
+                "diff_unit": self.diff_unit,
+                "left_unit": self.left_unit,
+                "right_unit": self.right_unit,
+                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     def _diff(self, x: TimedeltaArray):
         x = self._convert_diff_to_unit(x)
-        return x[x > 0]
+        return x[x > 0] if self.replace_negative else x
     def _agg(self, x):
         method = getattr(np, self.aggregation, None)
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
         super().__init__(**data)
     def _agg(self, x):
-        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        x = x[
+            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
+            & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
+        ]
         return super()._agg(x)

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/feature.py RENAMED Viewed

@@ -138,15 +138,17 @@ class Feature:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
+        should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
+        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
         if self.alias:
             components = ["f_autofe", self.alias]
-        elif shorten and not self.op.is_unary:
-            components = ["f_autofe", self.get_op_display_name()]
+        elif shorten and (not self.op.is_unary or should_stack_op):
+            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
         else:
-            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
-                "autofe",
-                self.get_op_display_name(),
-            ]
+            components = (
+                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            )
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -237,12 +239,18 @@ class Feature:
     @staticmethod
     def from_formula(string: str) -> Union[Column, "Feature"]:
-        if string[-1] != ")":
-            return Column(string)
         def is_trivial_char(c: str) -> bool:
             return c not in "()+-*/,"
+        if string[-1] != ")":
+            if all(is_trivial_char(c) for c in string):
+                return Column(string)
+            else:
+                raise ValueError(
+                    f"Unsupported column name: {string}. Column names should not have characters: ['(', ')', '+', '-', '*', '/', ',']"
+                )
         def find_prev(string: str) -> int:
             if string[-1] != ")":
                 return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -264,8 +272,11 @@ class Feature:
             return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
         p1 = find_prev(string[: p2 - 1])
         if string[0] == "(":
+            op = find_op(string[p2 - 1])
+            if op is None:
+                raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
             return Feature(
-                find_op(string[p2 - 1]),
+                op,
                 [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
             )
         else:
@@ -276,6 +287,8 @@ class Feature:
                     [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
                 )
             else:
+                if string[p1 - 1] == "(":
+                    raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
                 base_features = [
                     Feature.from_formula(string[p2:-1]),
                     Feature.from_formula(string[p1 : p2 - 1]),
@@ -321,10 +334,10 @@ class FeatureGroup:
             lower_order_names = [ch.get_display_name() for ch in lower_order_children]
             if any(isinstance(f, Feature) for f in lower_order_children):
                 child_data = pd.concat(
-                    [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
+                    [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
                     axis=1,
                 )
-                child_data.columns = [main_column] + lower_order_names
+                child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
             else:
                 child_data = data[columns]

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/autofe/unary.py RENAMED Viewed

@@ -125,3 +125,10 @@ class Norm(PandasOperand):
         normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
         normalized_data = normalized_data.reindex(data.index)
         return normalized_data
+class Embeddings(PandasOperand):
+    name = "emb"
+    is_unary = True
+    input_type = "string"
+    output_type = "vector"

{upgini-1.1.309 → upgini-1.1.309a3511.dev1}/src/upgini/data_source/data_source_publisher.py RENAMED Viewed

@@ -3,7 +3,7 @@ import time
 import uuid
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List, Literal, Optional, Union
+from typing import Dict, List, Optional, Union
 from upgini.errors import HttpError, ValidationError
 from upgini.http import LoggerFactory, get_rest_client
@@ -47,9 +47,7 @@ class DataSourcePublisher:
         self,
         data_table_uri: str,
         search_keys: Dict[str, SearchKey],
-        update_frequency: (
-            Literal["Daily"] | Literal["Weekly"] | Literal["Monthly"] | Literal["Quarterly"] | Literal["Annually"]
-        ),
+        update_frequency: str,
         exclude_from_autofe_generation: Optional[List[str]],
         secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
         sort_column: Optional[str] = None,
@@ -235,17 +233,11 @@ class DataSourcePublisher:
                 self.logger.exception("Failed to register data table")
                 raise
-    def remove(self, data_table_ids: List[str] | str):
+    def remove(self, data_table_ids: List[str]):
         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             try:
-                if not data_table_ids:
-                    raise ValidationError("Empty data table ids")
-                if isinstance(data_table_ids, str):
-                    data_table_ids = [data_table_ids]
-                if not isinstance(data_table_ids, list):
-                    raise ValidationError("Invalid format of data_table_ids argument")
-                if len(data_table_ids) == 0:
+                if data_table_ids is None or len(data_table_ids) == 0:
                     raise ValidationError("Empty data table ids")
                 for data_table_id in data_table_ids:
@@ -274,20 +266,16 @@ class DataSourcePublisher:
         source_link: Optional[str] = None,
         update_frequency: Optional[str] = None,
         client_emails: Optional[List[str]] = None,
-        date_features: Optional[List[str]] = None,
-        date_vector_features: Optional[List[str]] = None,
     ):
         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             try:
-                if data_table_ids is None:
+                if data_table_ids is None or len(data_table_ids) == 0:
                     raise ValidationError("Empty data table ids")
                 if isinstance(data_table_ids, str):
                     data_table_ids = [data_table_ids]
                 if not isinstance(data_table_ids, list):
                     raise ValidationError("data_table_ids should be string or list of strings")
-                if len(data_table_ids) == 0:
-                    raise ValidationError("Empty data table ids")
                 if update_frequency is not None and update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
                     raise ValidationError(
                         f"Invalid update frequency: {update_frequency}. "
@@ -323,10 +311,6 @@ class DataSourcePublisher:
                     request["updateFrequency"] = update_frequency
                 if client_emails is not None:
                     request["clientEmails"] = client_emails
-                if date_features is not None:
-                    request["dateFeatures"] = date_features
-                if date_vector_features is not None:
-                    request["dateVectorFeatures"] = date_vector_features
                 self.logger.info(f"Activating data tables with request {request}")
                 self._rest_client.activate_datatables(request, trace_id)