upgini 1.1.298__py3-none-any.whl → 1.1.299a3511.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.298"
1
+ __version__ = "1.1.299a3511.dev6"
@@ -1,6 +1,20 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import (
5
19
  DateDiff,
6
20
  DateDiffType2,
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
9
23
  DatePercentile,
10
24
  DatePercentileMethod2,
11
25
  )
12
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
26
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
13
27
  from upgini.autofe.operand import Operand
14
- from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
28
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
15
29
  from upgini.autofe.vector import Mean, Sum
16
30
 
17
31
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
39
53
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
40
54
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
41
55
  GroupByThenRank(),
42
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
56
+ Combine(),
57
+ CombineThenFreq(),
58
+ GroupByThenNUnique(),
59
+ GroupByThenFreq(),
46
60
  Sim(),
47
61
  DateDiff(),
48
62
  DateDiffType2(),
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
59
73
  DatePercentile(),
60
74
  DatePercentileMethod2(),
61
75
  Norm(),
76
+ JaroWinklerSim1(),
77
+ JaroWinklerSim2(),
78
+ LevenshteinSim(),
79
+ Distance(),
80
+ Embeddings(),
62
81
  ]
63
82
  }
64
83
 
upgini/autofe/binary.py CHANGED
@@ -1,7 +1,11 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
6
  from numpy import dot
4
7
  from numpy.linalg import norm
8
+ from jarowinkler import jarowinkler_similarity
5
9
 
6
10
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
11
 
@@ -130,7 +134,25 @@ class CombineThenFreq(PandasOperand):
130
134
  self._loc(temp, value_counts)
131
135
 
132
136
 
133
- class Sim(PandasOperand):
137
+ class Distance(PandasOperand):
138
+ name = "dist"
139
+ is_binary = True
140
+ output_type = "float"
141
+ is_symmetrical = True
142
+ has_symmetry_importance = True
143
+
144
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
+ return pd.Series(
146
+ 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
+ )
148
+
149
+ # row-wise dot product
150
+ def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
+ return (left * right).apply(np.sum)
152
+
153
+
154
+ # Left for backward compatibility
155
+ class Sim(Distance):
134
156
  name = "sim"
135
157
  is_binary = True
136
158
  output_type = "float"
@@ -138,4 +160,71 @@ class Sim(PandasOperand):
138
160
  has_symmetry_importance = True
139
161
 
140
162
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
- return dot(left, right) / (norm(left) * norm(right))
163
+ return 1 - super().calculate_binary(left, right)
164
+
165
+
166
+ class StringSim(PandasOperand, abc.ABC):
167
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
+ sims = []
169
+ for i in left.index:
170
+ left_i = self._prepare_value(left.get(i))
171
+ right_i = self._prepare_value(right.get(i))
172
+ if left_i is not None and right_i is not None:
173
+ sims.append(self._similarity(left_i, right_i))
174
+ else:
175
+ sims.append(None)
176
+
177
+ return pd.Series(sims, index=left.index)
178
+
179
+ @abc.abstractmethod
180
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
181
+ pass
182
+
183
+ @abc.abstractmethod
184
+ def _similarity(self, left: str, right: str) -> float:
185
+ pass
186
+
187
+
188
+ class JaroWinklerSim1(StringSim):
189
+ name = "sim_jw1"
190
+ is_binary = True
191
+ input_type = "string"
192
+ output_type = "float"
193
+ is_symmetrical = True
194
+ has_symmetry_importance = True
195
+
196
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
197
+ return value
198
+
199
+ def _similarity(self, left: str, right: str) -> float:
200
+ return jarowinkler_similarity(left, right)
201
+
202
+
203
+ class JaroWinklerSim2(StringSim):
204
+ name = "sim_jw2"
205
+ is_binary = True
206
+ input_type = "string"
207
+ output_type = "float"
208
+ is_symmetrical = True
209
+ has_symmetry_importance = True
210
+
211
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
212
+ return value[::-1] if value is not None else None
213
+
214
+ def _similarity(self, left: str, right: str) -> float:
215
+ return jarowinkler_similarity(left, right)
216
+
217
+
218
+ class LevenshteinSim(StringSim):
219
+ name = "sim_lv"
220
+ is_binary = True
221
+ input_type = "string"
222
+ output_type = "float"
223
+ is_symmetrical = True
224
+ has_symmetry_importance = True
225
+
226
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
227
+ return value
228
+
229
+ def _similarity(self, left: str, right: str) -> float:
230
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
upgini/autofe/date.py CHANGED
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
+ replace_negative: bool = False
47
+
46
48
  def get_params(self) -> Dict[str, Optional[str]]:
47
49
  res = super().get_params()
48
50
  res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
50
52
  "diff_unit": self.diff_unit,
51
53
  "left_unit": self.left_unit,
52
54
  "right_unit": self.right_unit,
55
+ "replace_negative": self.replace_negative,
53
56
  }
54
57
  )
55
58
  return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
61
64
  return self.__replace_negative(diff)
62
65
 
63
66
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
64
- x[x < 0] = None
67
+ if self.replace_negative:
68
+ x[x < 0] = None
65
69
  return x
66
70
 
67
71
 
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
101
105
  class DateListDiff(PandasOperand, DateDiffMixin):
102
106
  is_binary = True
103
107
  has_symmetry_importance = True
108
+
104
109
  aggregation: str
110
+ replace_negative: bool = False
105
111
 
106
112
  def get_params(self) -> Dict[str, Optional[str]]:
107
113
  res = super().get_params()
108
114
  res.update(
109
115
  {
110
116
  "aggregation": self.aggregation,
117
+ "diff_unit": self.diff_unit,
118
+ "left_unit": self.left_unit,
119
+ "right_unit": self.right_unit,
120
+ "replace_negative": self.replace_negative,
111
121
  }
112
122
  )
113
123
  return res
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
125
135
 
126
136
  def _diff(self, x: TimedeltaArray):
127
137
  x = self._convert_diff_to_unit(x)
128
- return x[x > 0]
138
+ return x[x > 0] if self.replace_negative else x
129
139
 
130
140
  def _agg(self, x):
131
141
  method = getattr(np, self.aggregation, None)
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
157
167
  super().__init__(**data)
158
168
 
159
169
  def _agg(self, x):
160
- x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
170
+ x = x[
171
+ (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
+ & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
+ ]
161
174
  return super()._agg(x)
162
175
 
163
176
 
upgini/autofe/feature.py CHANGED
@@ -140,8 +140,9 @@ class Feature:
140
140
 
141
141
  if self.alias:
142
142
  components = ["f_autofe", self.alias]
143
- elif shorten and not self.op.is_unary:
144
- components = ["f_autofe", self.get_op_display_name()]
143
+ elif shorten and not (self.op.is_unary and all(isinstance(c, Column) for c in self.children)):
144
+ prev_name = [self.children[0].get_op_display_name()] if self.op.is_unary else []
145
+ components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
145
146
  else:
146
147
  components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
148
  "autofe",
upgini/autofe/unary.py CHANGED
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
+
129
+
130
+ class Embeddings(PandasOperand):
131
+ name = "emb"
132
+ is_unary = True
133
+ input_type = "string"
134
+ output_type = "vector"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.298
3
+ Version: 1.1.299a3511.dev6
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
29
31
  Requires-Dist: lightgbm>=3.3.2
30
32
  Requires-Dist: numpy>=1.19.0
31
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -131,7 +133,7 @@ Description-Content-Type: text/markdown
131
133
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
132
134
  |World economic indicators|191 |41|-|Monthly|date, country|No
133
135
  |Markets data|-|17|-|Monthly|date, datetime|No
134
- |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
136
+ |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
135
137
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
136
138
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
137
139
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -840,4 +842,4 @@ Some convenient ways to start contributing are:
840
842
  - [More perks for registered users](https://profile.upgini.com)
841
843
 
842
844
  <sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
843
- Please report it here</a></sup>
845
+ Please report it here</a></sup>
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=DrcQXIql_sheOlvPg987gFNiLY_i-M0Y7coMswGwCfo,24
1
+ upgini/__about__.py,sha256=_0z3wkU1Qyf7uc0tWztaZ9d93IS373XBtHXVE9Apmzw,34
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
@@ -14,13 +14,13 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=XbvgX2IU4aee9rJZ--d5MdmrfKhON_emle5-RU1qlEY,2506
18
- upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
19
- upgini/autofe/date.py,sha256=8zYVhjl7jVS4xt-IjCgk9px2LHnACX2YlMlmDELlRTc,7943
20
- upgini/autofe/feature.py,sha256=ayxiF8Ip1ww_pt_BC9Pk127fAHZ_3fuluulS1EYLolk,13423
17
+ upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
+ upgini/autofe/binary.py,sha256=ml0MszLARZqp3UGUqTGsVjT4DD69zTisfBBEqbZ7klU,6767
19
+ upgini/autofe/date.py,sha256=Qq11EGLFHJxy5DQF2V1CBMtH2j4g5RpinRcw-7SobMs,8442
20
+ upgini/autofe/feature.py,sha256=cPbLJYAfzT8VqMDOGuEOBslJEDTdVphozQf6fCD8uuk,13587
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
22
  upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
23
+ upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
24
24
  upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.298.dist-info/METADATA,sha256=N-rsVNNjhl1dYRc2xSUZsTw3S0-6vQuyBr_gvMhXhkA,48153
61
- upgini-1.1.298.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
- upgini-1.1.298.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.298.dist-info/RECORD,,
60
+ upgini-1.1.299a3511.dev6.dist-info/METADATA,sha256=KzZj0GPmhe4dHrujcrKXrqe3xtQCN7OMGYPUjLKJGpA,48230
61
+ upgini-1.1.299a3511.dev6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.1.299a3511.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.299a3511.dev6.dist-info/RECORD,,