upgini 1.1.312__tar.gz → 1.1.312a1__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (68)
  1. {upgini-1.1.312 → upgini-1.1.312a1}/PKG-INFO +1 -3
  2. {upgini-1.1.312 → upgini-1.1.312a1}/pyproject.toml +0 -2
  3. upgini-1.1.312a1/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/all_operands.py +7 -26
  5. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/binary.py +4 -95
  6. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/date.py +3 -16
  7. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/feature.py +11 -25
  8. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/unary.py +0 -7
  9. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/dataset.py +30 -385
  10. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/features_enricher.py +276 -120
  11. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/metadata.py +16 -1
  12. upgini-1.1.312a1/src/upgini/normalizer/normalize_utils.py +203 -0
  13. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/country_utils.py +16 -0
  14. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/datetime_utils.py +34 -15
  15. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/email_utils.py +19 -5
  16. upgini-1.1.312a1/src/upgini/utils/ip_utils.py +152 -0
  17. upgini-1.1.312/src/upgini/normalizer/phone_normalizer.py → upgini-1.1.312a1/src/upgini/utils/phone_utils.py +41 -25
  18. upgini-1.1.312a1/src/upgini/utils/postal_code_utils.py +45 -0
  19. upgini-1.1.312/src/upgini/__about__.py +0 -1
  20. upgini-1.1.312/src/upgini/utils/ip_utils.py +0 -53
  21. upgini-1.1.312/src/upgini/utils/phone_utils.py +0 -11
  22. upgini-1.1.312/src/upgini/utils/postal_code_utils.py +0 -11
  23. {upgini-1.1.312 → upgini-1.1.312a1}/.gitignore +0 -0
  24. {upgini-1.1.312 → upgini-1.1.312a1}/LICENSE +0 -0
  25. {upgini-1.1.312 → upgini-1.1.312a1}/README.md +0 -0
  26. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/__init__.py +0 -0
  27. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/ads.py +0 -0
  28. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/ads_management/__init__.py +0 -0
  29. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/ads_management/ads_manager.py +0 -0
  30. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/__init__.py +0 -0
  31. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/groupby.py +0 -0
  32. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/operand.py +0 -0
  33. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/autofe/vector.py +0 -0
  34. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/data_source/__init__.py +0 -0
  35. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  36. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/errors.py +0 -0
  37. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/http.py +0 -0
  38. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/lazy_import.py +0 -0
  39. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/mdc/__init__.py +0 -0
  40. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/mdc/context.py +0 -0
  41. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/metrics.py +0 -0
  42. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/resource_bundle/__init__.py +0 -0
  44. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  45. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/resource_bundle/strings.properties +0 -0
  46. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/search_task.py +0 -0
  52. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/spinner.py +0 -0
  53. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/__init__.py +0 -0
  54. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  55. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/blocked_time_series.py +0 -0
  56. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  57. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/cv_utils.py +0 -0
  58. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  59. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/display_utils.py +0 -0
  60. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  61. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/features_validator.py +0 -0
  62. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/format.py +0 -0
  63. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/target_utils.py +0 -0
  66. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/track_info.py +0 -0
  67. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/utils/warning_counter.py +0 -0
  68. {upgini-1.1.312 → upgini-1.1.312a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.312
3
+ Version: 1.1.312a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: jarowinkler>=2.0.0
30
- Requires-Dist: levenshtein>=0.25.1
31
29
  Requires-Dist: lightgbm>=3.3.2
32
30
  Requires-Dist: numpy>=1.19.0
33
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -49,8 +49,6 @@ dependencies = [
49
49
  "scikit-learn>=1.3.0",
50
50
  "python-bidi==0.4.2",
51
51
  "xhtml2pdf==0.2.11",
52
- "jarowinkler>=2.0.0",
53
- "levenshtein>=0.25.1",
54
52
  ]
55
53
 
56
54
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.312a1"
@@ -1,20 +1,6 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import (
4
- Add,
5
- Combine,
6
- CombineThenFreq,
7
- Distance,
8
- Divide,
9
- JaroWinklerSim1,
10
- JaroWinklerSim2,
11
- LevenshteinSim,
12
- Max,
13
- Min,
14
- Multiply,
15
- Sim,
16
- Subtract,
17
- )
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
18
4
  from upgini.autofe.date import (
19
5
  DateDiff,
20
6
  DateDiffType2,
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
23
9
  DatePercentile,
24
10
  DatePercentileMethod2,
25
11
  )
26
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
12
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
27
13
  from upgini.autofe.operand import Operand
28
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
14
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
29
15
  from upgini.autofe.vector import Mean, Sum
30
16
 
31
17
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
53
39
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
54
40
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
55
41
  GroupByThenRank(),
56
- Combine(),
57
- CombineThenFreq(),
58
- GroupByThenNUnique(),
59
- GroupByThenFreq(),
42
+ Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
+ Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
+ Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
+ Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
60
46
  Sim(),
61
47
  DateDiff(),
62
48
  DateDiffType2(),
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
73
59
  DatePercentile(),
74
60
  DatePercentileMethod2(),
75
61
  Norm(),
76
- JaroWinklerSim1(),
77
- JaroWinklerSim2(),
78
- LevenshteinSim(),
79
- Distance(),
80
- Embeddings(),
81
62
  ]
82
63
  }
83
64
 
@@ -1,9 +1,7 @@
1
- import abc
2
- from typing import Optional
3
- import Levenshtein
4
1
  import numpy as np
5
2
  import pandas as pd
6
- from jarowinkler import jarowinkler_similarity
3
+ from numpy import dot
4
+ from numpy.linalg import norm
7
5
 
8
6
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
7
 
@@ -132,29 +130,7 @@ class CombineThenFreq(PandasOperand):
132
130
  self._loc(temp, value_counts)
133
131
 
134
132
 
135
- class Distance(PandasOperand):
136
- name = "dist"
137
- is_binary = True
138
- output_type = "float"
139
- is_symmetrical = True
140
- has_symmetry_importance = True
141
-
142
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
- return pd.Series(
144
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
145
- )
146
-
147
- # row-wise dot product
148
- def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
149
- left = left.apply(lambda x: np.array(x))
150
- right = right.apply(lambda x: np.array(x))
151
- res = (left.dropna() * right.dropna()).apply(np.sum)
152
- res = res.reindex(left.index.union(right.index))
153
- return res
154
-
155
-
156
- # Left for backward compatibility
157
- class Sim(Distance):
133
+ class Sim(PandasOperand):
158
134
  name = "sim"
159
135
  is_binary = True
160
136
  output_type = "float"
@@ -162,71 +138,4 @@ class Sim(Distance):
162
138
  has_symmetry_importance = True
163
139
 
164
140
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
165
- return 1 - super().calculate_binary(left, right)
166
-
167
-
168
- class StringSim(PandasOperand, abc.ABC):
169
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
170
- sims = []
171
- for i in left.index:
172
- left_i = self._prepare_value(left.get(i))
173
- right_i = self._prepare_value(right.get(i))
174
- if left_i is not None and right_i is not None:
175
- sims.append(self._similarity(left_i, right_i))
176
- else:
177
- sims.append(None)
178
-
179
- return pd.Series(sims, index=left.index)
180
-
181
- @abc.abstractmethod
182
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
183
- pass
184
-
185
- @abc.abstractmethod
186
- def _similarity(self, left: str, right: str) -> float:
187
- pass
188
-
189
-
190
- class JaroWinklerSim1(StringSim):
191
- name = "sim_jw1"
192
- is_binary = True
193
- input_type = "string"
194
- output_type = "float"
195
- is_symmetrical = True
196
- has_symmetry_importance = True
197
-
198
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
- return value
200
-
201
- def _similarity(self, left: str, right: str) -> float:
202
- return jarowinkler_similarity(left, right)
203
-
204
-
205
- class JaroWinklerSim2(StringSim):
206
- name = "sim_jw2"
207
- is_binary = True
208
- input_type = "string"
209
- output_type = "float"
210
- is_symmetrical = True
211
- has_symmetry_importance = True
212
-
213
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
- return value[::-1] if value is not None else None
215
-
216
- def _similarity(self, left: str, right: str) -> float:
217
- return jarowinkler_similarity(left, right)
218
-
219
-
220
- class LevenshteinSim(StringSim):
221
- name = "sim_lv"
222
- is_binary = True
223
- input_type = "string"
224
- output_type = "float"
225
- is_symmetrical = True
226
- has_symmetry_importance = True
227
-
228
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
- return value
230
-
231
- def _similarity(self, left: str, right: str) -> float:
232
- return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
141
+ return dot(left, right) / (norm(left) * norm(right))
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
- replace_negative: bool = False
47
-
48
46
  def get_params(self) -> Dict[str, Optional[str]]:
49
47
  res = super().get_params()
50
48
  res.update(
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
52
50
  "diff_unit": self.diff_unit,
53
51
  "left_unit": self.left_unit,
54
52
  "right_unit": self.right_unit,
55
- "replace_negative": self.replace_negative,
56
53
  }
57
54
  )
58
55
  return res
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
61
  return self.__replace_negative(diff)
65
62
 
66
63
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
67
- if self.replace_negative:
68
- x[x < 0] = None
64
+ x[x < 0] = None
69
65
  return x
70
66
 
71
67
 
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
105
101
  class DateListDiff(PandasOperand, DateDiffMixin):
106
102
  is_binary = True
107
103
  has_symmetry_importance = True
108
-
109
104
  aggregation: str
110
- replace_negative: bool = False
111
105
 
112
106
  def get_params(self) -> Dict[str, Optional[str]]:
113
107
  res = super().get_params()
114
108
  res.update(
115
109
  {
116
110
  "aggregation": self.aggregation,
117
- "diff_unit": self.diff_unit,
118
- "left_unit": self.left_unit,
119
- "right_unit": self.right_unit,
120
- "replace_negative": self.replace_negative,
121
111
  }
122
112
  )
123
113
  return res
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
135
125
 
136
126
  def _diff(self, x: TimedeltaArray):
137
127
  x = self._convert_diff_to_unit(x)
138
- return x[x > 0] if self.replace_negative else x
128
+ return x[x > 0]
139
129
 
140
130
  def _agg(self, x):
141
131
  method = getattr(np, self.aggregation, None)
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
167
157
  super().__init__(**data)
168
158
 
169
159
  def _agg(self, x):
170
- x = x[
171
- (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
- & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
- ]
160
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
174
161
  return super()._agg(x)
175
162
 
176
163
 
@@ -138,17 +138,15 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
- should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
- prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
-
144
141
  if self.alias:
145
142
  components = ["f_autofe", self.alias]
146
- elif shorten and (not self.op.is_unary or should_stack_op):
147
- components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
143
+ elif shorten and not self.op.is_unary:
144
+ components = ["f_autofe", self.get_op_display_name()]
148
145
  else:
149
- components = (
150
- ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
- )
146
+ components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
+ "autofe",
148
+ self.get_op_display_name(),
149
+ ]
152
150
  components.extend([str(self.display_index)] if self.display_index is not None else [])
153
151
  display_name = "_".join(components)
154
152
 
@@ -239,19 +237,12 @@ class Feature:
239
237
 
240
238
  @staticmethod
241
239
  def from_formula(string: str) -> Union[Column, "Feature"]:
240
+ if string[-1] != ")":
241
+ return Column(string)
242
242
 
243
243
  def is_trivial_char(c: str) -> bool:
244
244
  return c not in "()+-*/,"
245
245
 
246
- if string[-1] != ")":
247
- if all(is_trivial_char(c) for c in string):
248
- return Column(string)
249
- else:
250
- raise ValueError(
251
- f"Unsupported column name: {string}. Column names should not have characters: "
252
- "['(', ')', '+', '-', '*', '/', ',']"
253
- )
254
-
255
246
  def find_prev(string: str) -> int:
256
247
  if string[-1] != ")":
257
248
  return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -273,11 +264,8 @@ class Feature:
273
264
  return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
274
265
  p1 = find_prev(string[: p2 - 1])
275
266
  if string[0] == "(":
276
- op = find_op(string[p2 - 1])
277
- if op is None:
278
- raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
279
267
  return Feature(
280
- op,
268
+ find_op(string[p2 - 1]),
281
269
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
282
270
  )
283
271
  else:
@@ -288,8 +276,6 @@ class Feature:
288
276
  [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
289
277
  )
290
278
  else:
291
- if string[p1 - 1] == "(":
292
- raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
293
279
  base_features = [
294
280
  Feature.from_formula(string[p2:-1]),
295
281
  Feature.from_formula(string[p1 : p2 - 1]),
@@ -335,10 +321,10 @@ class FeatureGroup:
335
321
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
336
322
  if any(isinstance(f, Feature) for f in lower_order_children):
337
323
  child_data = pd.concat(
338
- [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
324
+ [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
339
325
  axis=1,
340
326
  )
341
- child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
327
+ child_data.columns = [main_column] + lower_order_names
342
328
  else:
343
329
  child_data = data[columns]
344
330
 
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
-
129
-
130
- class Embeddings(PandasOperand):
131
- name = "emb"
132
- is_unary = True
133
- input_type = "string"
134
- output_type = "vector"