upgini 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.312a5"
1
+ __version__ = "1.1.313a3511.dev1"
@@ -1,6 +1,20 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import (
5
19
  DateDiff,
6
20
  DateDiffType2,
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
9
23
  DatePercentile,
10
24
  DatePercentileMethod2,
11
25
  )
12
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
26
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
13
27
  from upgini.autofe.operand import Operand
14
- from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
28
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
15
29
  from upgini.autofe.vector import Mean, Sum
16
30
 
17
31
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
39
53
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
40
54
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
41
55
  GroupByThenRank(),
42
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
56
+ Combine(),
57
+ CombineThenFreq(),
58
+ GroupByThenNUnique(),
59
+ GroupByThenFreq(),
46
60
  Sim(),
47
61
  DateDiff(),
48
62
  DateDiffType2(),
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
59
73
  DatePercentile(),
60
74
  DatePercentileMethod2(),
61
75
  Norm(),
76
+ JaroWinklerSim1(),
77
+ JaroWinklerSim2(),
78
+ LevenshteinSim(),
79
+ Distance(),
80
+ Embeddings(),
62
81
  ]
63
82
  }
64
83
 
upgini/autofe/binary.py CHANGED
@@ -1,7 +1,9 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
- from numpy import dot
4
- from numpy.linalg import norm
6
+ from jarowinkler import jarowinkler_similarity
5
7
 
6
8
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
9
 
@@ -130,7 +132,29 @@ class CombineThenFreq(PandasOperand):
130
132
  self._loc(temp, value_counts)
131
133
 
132
134
 
133
- class Sim(PandasOperand):
135
class Distance(PandasOperand):
    """Row-wise cosine distance between two columns of numeric vectors.

    For every row the result is ``1 - <l, r> / (||l|| * ||r||)`` where
    ``<., .>`` is the dot product and ``||.||`` the Euclidean norm. Index
    labels removed by ``dropna()`` inside the dot product come back as NaN
    through the final ``reindex``.
    """

    name = "dist"
    is_binary = True
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return the cosine distance of each ``left``/``right`` pair, indexed like ``left``."""
        # The denominator must be the product of the norms, i.e.
        # sqrt(<l, l>) * sqrt(<r, r>). Dividing by <l, l> * <r, r> (the
        # squared norms) only matches cosine distance for unit vectors.
        cosine = self.__dot(left, right) / (self.__norm(left) * self.__norm(right))
        return pd.Series(1 - cosine, index=left.index)

    # row-wise Euclidean norm
    def __norm(self, vectors: pd.Series) -> pd.Series:
        # ``** 0.5`` rather than np.sqrt so object-dtype results keep working.
        return self.__dot(vectors, vectors) ** 0.5

    # row-wise dot product
    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
        left = left.apply(lambda x: np.array(x))
        right = right.apply(lambda x: np.array(x))
        res = (left.dropna() * right.dropna()).apply(np.sum)
        res = res.reindex(left.index.union(right.index))
        return res
154
+
155
+
156
+ # Left for backward compatibility
157
+ class Sim(Distance):
134
158
  name = "sim"
135
159
  is_binary = True
136
160
  output_type = "float"
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
138
162
  has_symmetry_importance = True
139
163
 
140
164
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
- return dot(left, right) / (norm(left) * norm(right))
165
+ return 1 - super().calculate_binary(left, right)
166
+
167
+
168
class StringSim(PandasOperand, abc.ABC):
    """Base class for binary operands that score the similarity of two strings.

    Subclasses provide ``_prepare_value`` (per-value preprocessing) and
    ``_similarity`` (the actual metric). Rows where either prepared value is
    ``None`` produce ``None`` in the output.
    """

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Compute the similarity for every label of ``left.index``.

        Values are looked up by label in both series, so ``right`` does not
        need to be ordered like ``left``; a label absent from ``right``
        yields ``None`` for that row.
        """
        sims = []
        for i in left.index:
            left_i = self._prepare_value(self._none_if_missing(left.get(i)))
            right_i = self._prepare_value(self._none_if_missing(right.get(i)))
            if left_i is not None and right_i is not None:
                sims.append(self._similarity(left_i, right_i))
            else:
                sims.append(None)

        return pd.Series(sims, index=left.index)

    def _none_if_missing(self, value):
        """Map pandas missing markers (None, NaN, pd.NA) to ``None``.

        Missing strings may be stored as float NaN or pd.NA rather than None;
        passing those on to the string metrics would raise ``TypeError``.
        """
        if value is None or value is pd.NA:
            return None
        if isinstance(value, float) and np.isnan(value):
            return None
        return value

    @abc.abstractmethod
    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        """Preprocess one cell; return ``None`` to mark it as missing."""
        pass

    @abc.abstractmethod
    def _similarity(self, left: str, right: str) -> float:
        """Return the similarity score of two prepared, non-missing strings."""
        pass
188
+
189
+
190
class JaroWinklerSim1(StringSim):
    """Jaro-Winkler similarity computed on the strings exactly as given."""

    name = "sim_jw1"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        # No preprocessing: the value (or None) is passed through untouched.
        return value

    def _similarity(self, left: str, right: str) -> float:
        score = jarowinkler_similarity(left, right)
        return score
203
+
204
+
205
class JaroWinklerSim2(StringSim):
    """Jaro-Winkler similarity computed on the reversed strings."""

    name = "sim_jw2"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        # Missing values stay missing; everything else is reversed.
        if value is None:
            return None
        return value[::-1]

    def _similarity(self, left: str, right: str) -> float:
        score = jarowinkler_similarity(left, right)
        return score
218
+
219
+
220
class LevenshteinSim(StringSim):
    """Similarity derived from the Levenshtein edit distance.

    The score is ``1 - distance / max(len(left), len(right))``, so identical
    strings score 1.0.
    """

    name = "sim_lv"
    is_binary = True
    input_type = "string"
    output_type = "float"
    is_symmetrical = True
    has_symmetry_importance = True

    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
        # No preprocessing: the value (or None) is passed through untouched.
        return value

    def _similarity(self, left: str, right: str) -> float:
        longest = max(len(left), len(right))
        if longest == 0:
            # Both strings are empty: they are identical, and normalising by
            # a zero length would raise ZeroDivisionError.
            return 1.0
        return 1 - Levenshtein.distance(left, right) / longest
upgini/autofe/date.py CHANGED
@@ -20,7 +20,7 @@ class DateDiffMixin(BaseModel):
20
20
  if isinstance(x, pd.DataFrame):
21
21
  return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
22
22
 
23
- return pd.to_datetime(x, unit=unit, errors='coerce')
23
+ return pd.to_datetime(x, unit=unit, errors="coerce")
24
24
 
25
25
  def _convert_diff_to_unit(self, diff: Union[pd.Series, TimedeltaArray]) -> Union[pd.Series, TimedeltaArray]:
26
26
  if self.diff_unit == "D":
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
+ replace_negative: bool = False
47
+
46
48
  def get_params(self) -> Dict[str, Optional[str]]:
47
49
  res = super().get_params()
48
50
  res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
50
52
  "diff_unit": self.diff_unit,
51
53
  "left_unit": self.left_unit,
52
54
  "right_unit": self.right_unit,
55
+ "replace_negative": self.replace_negative,
53
56
  }
54
57
  )
55
58
  return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
61
64
  return self.__replace_negative(diff)
62
65
 
63
66
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
64
- x[x < 0] = None
67
+ if self.replace_negative:
68
+ x[x < 0] = None
65
69
  return x
66
70
 
67
71
 
@@ -96,18 +100,25 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
96
100
 
97
101
 
98
102
  _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
103
+ _count_aggregations = ["nunique", "count"]
99
104
 
100
105
 
101
106
  class DateListDiff(PandasOperand, DateDiffMixin):
102
107
  is_binary = True
103
108
  has_symmetry_importance = True
109
+
104
110
  aggregation: str
111
+ replace_negative: bool = False
105
112
 
106
113
  def get_params(self) -> Dict[str, Optional[str]]:
107
114
  res = super().get_params()
108
115
  res.update(
109
116
  {
110
117
  "aggregation": self.aggregation,
118
+ "diff_unit": self.diff_unit,
119
+ "left_unit": self.left_unit,
120
+ "right_unit": self.right_unit,
121
+ "replace_negative": self.replace_negative,
111
122
  }
112
123
  )
113
124
  return res
@@ -119,13 +130,19 @@ class DateListDiff(PandasOperand, DateDiffMixin):
119
130
 
120
131
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
121
132
  left = self._convert_to_date(left, self.left_unit)
122
- right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
133
+ right_mask = right.apply(lambda x: len(x) > 0)
134
+ mask = left.notna() & right.notna() & right_mask
135
+ right_masked = right[mask].apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
136
+ res_masked = pd.Series(left[mask] - right_masked.values).apply(lambda x: self._agg(self._diff(x)))
137
+ res = res_masked.reindex(left.index.union(right.index))
138
+ if self.aggregation in _count_aggregations:
139
+ res[~right_mask] = 0.0
123
140
 
124
- return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
141
+ return res
125
142
 
126
143
  def _diff(self, x: TimedeltaArray):
127
144
  x = self._convert_diff_to_unit(x)
128
- return x[x > 0]
145
+ return x[x > 0] if self.replace_negative else x
129
146
 
130
147
  def _agg(self, x):
131
148
  method = getattr(np, self.aggregation, None)
@@ -157,7 +174,10 @@ class DateListDiffBounded(DateListDiff):
157
174
  super().__init__(**data)
158
175
 
159
176
  def _agg(self, x):
160
- x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
177
+ x = x[
178
+ (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
179
+ & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
180
+ ]
161
181
  return super()._agg(x)
162
182
 
163
183
 
upgini/autofe/feature.py CHANGED
@@ -138,15 +138,17 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
+ should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
+ prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
+
141
144
  if self.alias:
142
145
  components = ["f_autofe", self.alias]
143
- elif shorten and not self.op.is_unary:
144
- components = ["f_autofe", self.get_op_display_name()]
146
+ elif shorten and (not self.op.is_unary or should_stack_op):
147
+ components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
145
148
  else:
146
- components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
- "autofe",
148
- self.get_op_display_name(),
149
- ]
149
+ components = (
150
+ ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
+ )
150
152
  components.extend([str(self.display_index)] if self.display_index is not None else [])
151
153
  display_name = "_".join(components)
152
154
 
@@ -237,12 +239,19 @@ class Feature:
237
239
 
238
240
  @staticmethod
239
241
  def from_formula(string: str) -> Union[Column, "Feature"]:
240
- if string[-1] != ")":
241
- return Column(string)
242
242
 
243
243
  def is_trivial_char(c: str) -> bool:
244
244
  return c not in "()+-*/,"
245
245
 
246
+ if string[-1] != ")":
247
+ if all(is_trivial_char(c) for c in string):
248
+ return Column(string)
249
+ else:
250
+ raise ValueError(
251
+ f"Unsupported column name: {string}. Column names should not have characters: "
252
+ "['(', ')', '+', '-', '*', '/', ',']"
253
+ )
254
+
246
255
  def find_prev(string: str) -> int:
247
256
  if string[-1] != ")":
248
257
  return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -264,8 +273,11 @@ class Feature:
264
273
  return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
265
274
  p1 = find_prev(string[: p2 - 1])
266
275
  if string[0] == "(":
276
+ op = find_op(string[p2 - 1])
277
+ if op is None:
278
+ raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
267
279
  return Feature(
268
- find_op(string[p2 - 1]),
280
+ op,
269
281
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
270
282
  )
271
283
  else:
@@ -276,6 +288,8 @@ class Feature:
276
288
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
277
289
  )
278
290
  else:
291
+ if string[p1 - 1] == "(":
292
+ raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
279
293
  base_features = [
280
294
  Feature.from_formula(string[p2:-1]),
281
295
  Feature.from_formula(string[p1 : p2 - 1]),
@@ -321,10 +335,10 @@ class FeatureGroup:
321
335
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
322
336
  if any(isinstance(f, Feature) for f in lower_order_children):
323
337
  child_data = pd.concat(
324
- [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
338
+ [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
325
339
  axis=1,
326
340
  )
327
- child_data.columns = [main_column] + lower_order_names
341
+ child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
328
342
  else:
329
343
  child_data = data[columns]
330
344
 
upgini/autofe/unary.py CHANGED
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
+
129
+
130
class Embeddings(PandasOperand):
    """Unary operand descriptor: takes a string input and yields a vector output.

    Only metadata is declared here; no calculation method is defined in this class.
    """

    # Short identifier of the operand.
    name = "emb"
    # Operates on a single input column.
    is_unary = True
    # Declared input and output kinds.
    input_type = "string"
    output_type = "vector"