PyPI - upgini - Versions diffs - 1.1.299a3511.dev10__tar.gz → 1.1.300__tar.gz - Mend

upgini 1.1.299a3511.dev10__tar.gz → 1.1.300__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (66) hide show

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.1.299a3511.dev10
+Version: 1.1.300
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
 Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
-Requires-Dist: jarowinkler>=2.0.0
-Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=3.3.2
 Requires-Dist: numpy>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -133,7 +131,7 @@ Description-Content-Type: text/markdown
 |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
 |World economic indicators|191 |41|-|Monthly|date, country|No
 |Markets data|-|17|-|Monthly|date, datetime|No
-|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
+|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
 |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
 |World house prices |44|-|3|Annual|country, postal/ZIP code|No
 |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -842,4 +840,4 @@ Some convenient ways to start contributing are:
 - [More perks for registered users](https://profile.upgini.com)
 <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
-Please report it here</a></sup>
+Please report it here</a></sup>

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/README.md RENAMED Viewed

@@ -90,7 +90,7 @@
 |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
 |World economic indicators|191 |41|-|Monthly|date, country|No
 |Markets data|-|17|-|Monthly|date, datetime|No
-|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
+|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
 |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
 |World house prices |44|-|3|Annual|country, postal/ZIP code|No
 |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
 - [More perks for registered users](https://profile.upgini.com)
 <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
-Please report it here</a></sup>
+Please report it here</a></sup>

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/pyproject.toml RENAMED Viewed

@@ -49,9 +49,6 @@ dependencies = [
     "scikit-learn>=1.3.0",
     "python-bidi==0.4.2",
     "xhtml2pdf==0.2.11",
-    "jarowinkler>=2.0.0",
-    "levenshtein>=0.25.1",
-    "python-bidi==0.4.2",
 ]
 [project.urls]

upgini-1.1.300/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.1.300"

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/all_operands.py RENAMED Viewed

@@ -1,20 +1,6 @@
 from typing import Dict
-from upgini.autofe.binary import (
-    Add,
-    Combine,
-    CombineThenFreq,
-    Distance,
-    Divide,
-    JaroWinklerSim1,
-    JaroWinklerSim2,
-    LevenshteinSim,
-    Max,
-    Min,
-    Multiply,
-    Sim,
-    Subtract,
-)
+from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
 from upgini.autofe.date import (
     DateDiff,
     DateDiffType2,
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
     DatePercentile,
     DatePercentileMethod2,
 )
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
+from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
 from upgini.autofe.vector import Mean, Sum
 ALL_OPERANDS: Dict[str, Operand] = {
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
         GroupByThenAgg(name="GroupByThenMedian", agg="median"),
         GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
         GroupByThenRank(),
-        Combine(),
-        CombineThenFreq(),
-        GroupByThenNUnique(),
-        GroupByThenFreq(),
+        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
+        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
+        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
+        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
         Sim(),
         DateDiff(),
         DateDiffType2(),
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DatePercentile(),
         DatePercentileMethod2(),
         Norm(),
-        JaroWinklerSim1(),
-        JaroWinklerSim2(),
-        LevenshteinSim(),
-        Distance(),
-        Embeddings(),
     ]
 }

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/binary.py RENAMED Viewed

@@ -1,11 +1,7 @@
-import abc
-from typing import Optional
-import Levenshtein
 import numpy as np
 import pandas as pd
 from numpy import dot
 from numpy.linalg import norm
-from jarowinkler import jarowinkler_similarity
 from upgini.autofe.operand import PandasOperand, VectorizableMixin
@@ -134,27 +130,7 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Distance(PandasOperand):
-    name = "dist"
-    is_binary = True
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return pd.Series(
-            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
-        )
-    # row-wise dot product
-    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        res = (left.dropna() * right.dropna()).apply(np.sum)
-        res = res.reindex(left.index.union(right.index))
-        return res
-# Left for backward compatibility
-class Sim(Distance):
+class Sim(PandasOperand):
     name = "sim"
     is_binary = True
     output_type = "float"
@@ -162,71 +138,4 @@ class Sim(Distance):
     has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return 1 - super().calculate_binary(left, right)
-class StringSim(PandasOperand, abc.ABC):
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        sims = []
-        for i in left.index:
-            left_i = self._prepare_value(left.get(i))
-            right_i = self._prepare_value(right.get(i))
-            if left_i is not None and right_i is not None:
-                sims.append(self._similarity(left_i, right_i))
-            else:
-                sims.append(None)
-        return pd.Series(sims, index=left.index)
-    @abc.abstractmethod
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        pass
-    @abc.abstractmethod
-    def _similarity(self, left: str, right: str) -> float:
-        pass
-class JaroWinklerSim1(StringSim):
-    name = "sim_jw1"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
-    def _similarity(self, left: str, right: str) -> float:
-        return jarowinkler_similarity(left, right)
-class JaroWinklerSim2(StringSim):
-    name = "sim_jw2"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
-    def _similarity(self, left: str, right: str) -> float:
-        return jarowinkler_similarity(left, right)
-class LevenshteinSim(StringSim):
-    name = "sim_lv"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
-    def _similarity(self, left: str, right: str) -> float:
-        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
+        return dot(left, right) / (norm(left) * norm(right))

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/date.py RENAMED Viewed

@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
-    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
                 "diff_unit": self.diff_unit,
                 "left_unit": self.left_unit,
                 "right_unit": self.right_unit,
-                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return self.__replace_negative(diff)
     def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
-        if self.replace_negative:
-            x[x < 0] = None
+        x[x < 0] = None
         return x
@@ -89,7 +85,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
         future = right + (left.dt.year - right.dt.year).apply(
-            lambda y: pd.tseries.offsets.DateOffset(years=0 if np.isnan(y) else y)
+            lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
         )
         future = pd.to_datetime(future)
         before = future[future < left]
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 class DateListDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
     aggregation: str
-    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
                 "aggregation": self.aggregation,
-                "diff_unit": self.diff_unit,
-                "left_unit": self.left_unit,
-                "right_unit": self.right_unit,
-                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     def _diff(self, x: TimedeltaArray):
         x = self._convert_diff_to_unit(x)
-        return x[x > 0] if self.replace_negative else x
+        return x[x > 0]
     def _agg(self, x):
         method = getattr(np, self.aggregation, None)
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
         super().__init__(**data)
     def _agg(self, x):
-        x = x[
-            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
-            & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
-        ]
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
         return super()._agg(x)

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/feature.py RENAMED Viewed

@@ -138,17 +138,15 @@ class Feature:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
-        should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
-        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
         if self.alias:
             components = ["f_autofe", self.alias]
-        elif shorten and (not self.op.is_unary or should_stack_op):
-            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
+        elif shorten and not self.op.is_unary:
+            components = ["f_autofe", self.get_op_display_name()]
         else:
-            components = (
-                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
-            )
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
+                "autofe",
+                self.get_op_display_name(),
+            ]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -323,10 +321,10 @@ class FeatureGroup:
             lower_order_names = [ch.get_display_name() for ch in lower_order_children]
             if any(isinstance(f, Feature) for f in lower_order_children):
                 child_data = pd.concat(
-                    [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
+                    [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
                     axis=1,
                 )
-                child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
+                child_data.columns = [main_column] + lower_order_names
             else:
                 child_data = data[columns]

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/unary.py RENAMED Viewed

@@ -125,10 +125,3 @@ class Norm(PandasOperand):
         normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
         normalized_data = normalized_data.reindex(data.index)
         return normalized_data
-class Embeddings(PandasOperand):
-    name = "emb"
-    is_unary = True
-    input_type = "string"
-    output_type = "vector"

{upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/dataset.py RENAMED Viewed

@@ -23,7 +23,9 @@ from pandas.api.types import (
 from upgini.errors import ValidationError
 from upgini.http import ProgressStage, SearchProgress, _RestClient
 from upgini.metadata import (
+    ENTITY_SYSTEM_RECORD_ID,
     EVAL_SET_INDEX,
+    SEARCH_KEY_UNNEST,
     SYSTEM_COLUMNS,
     SYSTEM_RECORD_ID,
     TARGET,
@@ -79,6 +81,7 @@ class Dataset:  # (pd.DataFrame):
         path: Optional[str] = None,
         meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
         search_keys: Optional[List[Tuple[str, ...]]] = None,
+        unnest_search_keys: Optional[Dict[str, str]] = None,
         model_task_type: Optional[ModelTaskType] = None,
         random_state: Optional[int] = None,
         rest_client: Optional[_RestClient] = None,
@@ -113,6 +116,7 @@ class Dataset:  # (pd.DataFrame):
         self.description = description
         self.meaning_types = meaning_types
         self.search_keys = search_keys
+        self.unnest_search_keys = unnest_search_keys
         self.ignore_columns = []
         self.hierarchical_group_keys = []
         self.hierarchical_subgroup_keys = []
@@ -172,7 +176,7 @@ class Dataset:  # (pd.DataFrame):
         new_columns = []
         dup_counter = 0
         for column in self.data.columns:
-            if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
+            if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
                 self.columns_renaming[column] = column
                 new_columns.append(column)
                 continue
@@ -353,7 +357,9 @@ class Dataset:  # (pd.DataFrame):
             if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
                 try:
-                    self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
+                    self.data[postal_code] = (
+                        self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
+                    )
                 except Exception:
                     pass
             elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +809,9 @@ class Dataset:  # (pd.DataFrame):
                     meaningType=meaning_type,
                     minMaxValues=min_max_values,
                 )
+                if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
+                    column_meta.isUnnest = True
+                    column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
                 columns.append(column_meta)

upgini 1.1.299a3511.dev10__tar.gz → 1.1.300__tar.gz

Potentially problematic release.

upgini 1.1.299a3511.dev10__tar.gz → 1.1.300__tar.gz