upgini 1.1.311__tar.gz → 1.1.312__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.1.311 → upgini-1.1.312}/PKG-INFO +3 -1
  2. {upgini-1.1.311 → upgini-1.1.312}/pyproject.toml +2 -0
  3. upgini-1.1.312/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/all_operands.py +26 -7
  5. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/binary.py +95 -4
  6. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/date.py +16 -3
  7. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/feature.py +25 -11
  8. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/unary.py +7 -0
  9. upgini-1.1.311/src/upgini/__about__.py +0 -1
  10. {upgini-1.1.311 → upgini-1.1.312}/.gitignore +0 -0
  11. {upgini-1.1.311 → upgini-1.1.312}/LICENSE +0 -0
  12. {upgini-1.1.311 → upgini-1.1.312}/README.md +0 -0
  13. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/__init__.py +0 -0
  14. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/ads.py +0 -0
  15. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/groupby.py +0 -0
  19. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/operand.py +0 -0
  20. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/autofe/vector.py +0 -0
  21. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/data_source/__init__.py +0 -0
  22. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/data_source/data_source_publisher.py +0 -0
  23. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/dataset.py +0 -0
  24. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/errors.py +0 -0
  25. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/features_enricher.py +0 -0
  26. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/http.py +0 -0
  27. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.311 → upgini-1.1.312}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.311
3
+ Version: 1.1.312
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
29
31
  Requires-Dist: lightgbm>=3.3.2
30
32
  Requires-Dist: numpy>=1.19.0
31
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -49,6 +49,8 @@ dependencies = [
49
49
  "scikit-learn>=1.3.0",
50
50
  "python-bidi==0.4.2",
51
51
  "xhtml2pdf==0.2.11",
52
+ "jarowinkler>=2.0.0",
53
+ "levenshtein>=0.25.1",
52
54
  ]
53
55
 
54
56
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.312"
@@ -1,6 +1,20 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import (
5
19
  DateDiff,
6
20
  DateDiffType2,
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
9
23
  DatePercentile,
10
24
  DatePercentileMethod2,
11
25
  )
12
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
26
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
13
27
  from upgini.autofe.operand import Operand
14
- from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
28
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
15
29
  from upgini.autofe.vector import Mean, Sum
16
30
 
17
31
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
39
53
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
40
54
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
41
55
  GroupByThenRank(),
42
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
56
+ Combine(),
57
+ CombineThenFreq(),
58
+ GroupByThenNUnique(),
59
+ GroupByThenFreq(),
46
60
  Sim(),
47
61
  DateDiff(),
48
62
  DateDiffType2(),
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
59
73
  DatePercentile(),
60
74
  DatePercentileMethod2(),
61
75
  Norm(),
76
+ JaroWinklerSim1(),
77
+ JaroWinklerSim2(),
78
+ LevenshteinSim(),
79
+ Distance(),
80
+ Embeddings(),
62
81
  ]
63
82
  }
64
83
 
@@ -1,7 +1,9 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
- from numpy import dot
4
- from numpy.linalg import norm
6
+ from jarowinkler import jarowinkler_similarity
5
7
 
6
8
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
9
 
@@ -130,7 +132,29 @@ class CombineThenFreq(PandasOperand):
130
132
  self._loc(temp, value_counts)
131
133
 
132
134
 
133
- class Sim(PandasOperand):
135
+ class Distance(PandasOperand):
136
+ name = "dist"
137
+ is_binary = True
138
+ output_type = "float"
139
+ is_symmetrical = True
140
+ has_symmetry_importance = True
141
+
142
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
+ return pd.Series(
144
+ 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
145
+ )
146
+
147
+ # row-wise dot product
148
+ def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
149
+ left = left.apply(lambda x: np.array(x))
150
+ right = right.apply(lambda x: np.array(x))
151
+ res = (left.dropna() * right.dropna()).apply(np.sum)
152
+ res = res.reindex(left.index.union(right.index))
153
+ return res
154
+
155
+
156
+ # Left for backward compatibility
157
+ class Sim(Distance):
134
158
  name = "sim"
135
159
  is_binary = True
136
160
  output_type = "float"
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
138
162
  has_symmetry_importance = True
139
163
 
140
164
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
- return dot(left, right) / (norm(left) * norm(right))
165
+ return 1 - super().calculate_binary(left, right)
166
+
167
+
168
+ class StringSim(PandasOperand, abc.ABC):
169
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
170
+ sims = []
171
+ for i in left.index:
172
+ left_i = self._prepare_value(left.get(i))
173
+ right_i = self._prepare_value(right.get(i))
174
+ if left_i is not None and right_i is not None:
175
+ sims.append(self._similarity(left_i, right_i))
176
+ else:
177
+ sims.append(None)
178
+
179
+ return pd.Series(sims, index=left.index)
180
+
181
+ @abc.abstractmethod
182
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
183
+ pass
184
+
185
+ @abc.abstractmethod
186
+ def _similarity(self, left: str, right: str) -> float:
187
+ pass
188
+
189
+
190
+ class JaroWinklerSim1(StringSim):
191
+ name = "sim_jw1"
192
+ is_binary = True
193
+ input_type = "string"
194
+ output_type = "float"
195
+ is_symmetrical = True
196
+ has_symmetry_importance = True
197
+
198
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
+ return value
200
+
201
+ def _similarity(self, left: str, right: str) -> float:
202
+ return jarowinkler_similarity(left, right)
203
+
204
+
205
+ class JaroWinklerSim2(StringSim):
206
+ name = "sim_jw2"
207
+ is_binary = True
208
+ input_type = "string"
209
+ output_type = "float"
210
+ is_symmetrical = True
211
+ has_symmetry_importance = True
212
+
213
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
+ return value[::-1] if value is not None else None
215
+
216
+ def _similarity(self, left: str, right: str) -> float:
217
+ return jarowinkler_similarity(left, right)
218
+
219
+
220
+ class LevenshteinSim(StringSim):
221
+ name = "sim_lv"
222
+ is_binary = True
223
+ input_type = "string"
224
+ output_type = "float"
225
+ is_symmetrical = True
226
+ has_symmetry_importance = True
227
+
228
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
+ return value
230
+
231
+ def _similarity(self, left: str, right: str) -> float:
232
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
+ replace_negative: bool = False
47
+
46
48
  def get_params(self) -> Dict[str, Optional[str]]:
47
49
  res = super().get_params()
48
50
  res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
50
52
  "diff_unit": self.diff_unit,
51
53
  "left_unit": self.left_unit,
52
54
  "right_unit": self.right_unit,
55
+ "replace_negative": self.replace_negative,
53
56
  }
54
57
  )
55
58
  return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
61
64
  return self.__replace_negative(diff)
62
65
 
63
66
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
64
- x[x < 0] = None
67
+ if self.replace_negative:
68
+ x[x < 0] = None
65
69
  return x
66
70
 
67
71
 
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
101
105
  class DateListDiff(PandasOperand, DateDiffMixin):
102
106
  is_binary = True
103
107
  has_symmetry_importance = True
108
+
104
109
  aggregation: str
110
+ replace_negative: bool = False
105
111
 
106
112
  def get_params(self) -> Dict[str, Optional[str]]:
107
113
  res = super().get_params()
108
114
  res.update(
109
115
  {
110
116
  "aggregation": self.aggregation,
117
+ "diff_unit": self.diff_unit,
118
+ "left_unit": self.left_unit,
119
+ "right_unit": self.right_unit,
120
+ "replace_negative": self.replace_negative,
111
121
  }
112
122
  )
113
123
  return res
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
125
135
 
126
136
  def _diff(self, x: TimedeltaArray):
127
137
  x = self._convert_diff_to_unit(x)
128
- return x[x > 0]
138
+ return x[x > 0] if self.replace_negative else x
129
139
 
130
140
  def _agg(self, x):
131
141
  method = getattr(np, self.aggregation, None)
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
157
167
  super().__init__(**data)
158
168
 
159
169
  def _agg(self, x):
160
- x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
170
+ x = x[
171
+ (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
+ & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
+ ]
161
174
  return super()._agg(x)
162
175
 
163
176
 
@@ -138,15 +138,17 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
+ should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
+ prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
+
141
144
  if self.alias:
142
145
  components = ["f_autofe", self.alias]
143
- elif shorten and not self.op.is_unary:
144
- components = ["f_autofe", self.get_op_display_name()]
146
+ elif shorten and (not self.op.is_unary or should_stack_op):
147
+ components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
145
148
  else:
146
- components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
- "autofe",
148
- self.get_op_display_name(),
149
- ]
149
+ components = (
150
+ ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
+ )
150
152
  components.extend([str(self.display_index)] if self.display_index is not None else [])
151
153
  display_name = "_".join(components)
152
154
 
@@ -237,12 +239,19 @@ class Feature:
237
239
 
238
240
  @staticmethod
239
241
  def from_formula(string: str) -> Union[Column, "Feature"]:
240
- if string[-1] != ")":
241
- return Column(string)
242
242
 
243
243
  def is_trivial_char(c: str) -> bool:
244
244
  return c not in "()+-*/,"
245
245
 
246
+ if string[-1] != ")":
247
+ if all(is_trivial_char(c) for c in string):
248
+ return Column(string)
249
+ else:
250
+ raise ValueError(
251
+ f"Unsupported column name: {string}. Column names should not have characters: "
252
+ "['(', ')', '+', '-', '*', '/', ',']"
253
+ )
254
+
246
255
  def find_prev(string: str) -> int:
247
256
  if string[-1] != ")":
248
257
  return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -264,8 +273,11 @@ class Feature:
264
273
  return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
265
274
  p1 = find_prev(string[: p2 - 1])
266
275
  if string[0] == "(":
276
+ op = find_op(string[p2 - 1])
277
+ if op is None:
278
+ raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
267
279
  return Feature(
268
- find_op(string[p2 - 1]),
280
+ op,
269
281
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
270
282
  )
271
283
  else:
@@ -276,6 +288,8 @@ class Feature:
276
288
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
277
289
  )
278
290
  else:
291
+ if string[p1 - 1] == "(":
292
+ raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
279
293
  base_features = [
280
294
  Feature.from_formula(string[p2:-1]),
281
295
  Feature.from_formula(string[p1 : p2 - 1]),
@@ -321,10 +335,10 @@ class FeatureGroup:
321
335
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
322
336
  if any(isinstance(f, Feature) for f in lower_order_children):
323
337
  child_data = pd.concat(
324
- [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
338
+ [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
325
339
  axis=1,
326
340
  )
327
- child_data.columns = [main_column] + lower_order_names
341
+ child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
328
342
  else:
329
343
  child_data = data[columns]
330
344
 
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
+
129
+
130
+ class Embeddings(PandasOperand):
131
+ name = "emb"
132
+ is_unary = True
133
+ input_type = "string"
134
+ output_type = "vector"
@@ -1 +0,0 @@
1
- __version__ = "1.1.311"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes