upgini 1.1.296a3511.dev4__py3-none-any.whl → 1.1.296a3521.dev1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.296a3511.dev4"
1
+ __version__ = "1.1.296a3521.dev1"
upgini/autofe/all_operands.py CHANGED
@@ -1,24 +1,17 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import (
4
- Add,
5
- Combine,
6
- CombineThenFreq,
7
- Distance,
8
- Divide,
9
- JaroWinklerSim1,
10
- JaroWinklerSim2,
11
- LevenshteinSim,
12
- Max,
13
- Min,
14
- Multiply,
15
- Sim,
16
- Subtract,
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
+ from upgini.autofe.date import (
5
+ DateDiff,
6
+ DateDiffType2,
7
+ DateListDiff,
8
+ DateListDiffBounded,
9
+ DatePercentile,
10
+ DatePercentileType2,
17
11
  )
18
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
19
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
12
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
20
13
  from upgini.autofe.operand import Operand
21
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
14
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
22
15
  from upgini.autofe.vector import Mean, Sum
23
16
 
24
17
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -46,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
46
39
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
47
40
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
48
41
  GroupByThenRank(),
49
- Combine(),
50
- CombineThenFreq(),
51
- GroupByThenNUnique(),
52
- GroupByThenFreq(),
42
+ Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
+ Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
+ Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
+ Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
53
46
  Sim(),
54
47
  DateDiff(),
55
48
  DateDiffType2(),
@@ -64,12 +57,8 @@ ALL_OPERANDS: Dict[str, Operand] = {
64
57
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
65
58
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
66
59
  DatePercentile(),
60
+ DatePercentileType2(),
67
61
  Norm(),
68
- JaroWinklerSim1(),
69
- JaroWinklerSim2(),
70
- LevenshteinSim(),
71
- Distance(),
72
- Embeddings(),
73
62
  ]
74
63
  }
75
64
 
upgini/autofe/binary.py CHANGED
@@ -1,11 +1,7 @@
1
- import abc
2
- from typing import Optional
3
- import Levenshtein
4
1
  import numpy as np
5
2
  import pandas as pd
6
3
  from numpy import dot
7
4
  from numpy.linalg import norm
8
- from jarowinkler import jarowinkler_similarity
9
5
 
10
6
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
11
7
 
@@ -134,25 +130,7 @@ class CombineThenFreq(PandasOperand):
134
130
  self._loc(temp, value_counts)
135
131
 
136
132
 
137
- class Distance(PandasOperand):
138
- name = "dist"
139
- is_binary = True
140
- output_type = "float"
141
- is_symmetrical = True
142
- has_symmetry_importance = True
143
-
144
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
- return pd.Series(
146
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
- )
148
-
149
- # row-wise dot product
150
- def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
- return (left * right).apply(np.sum)
152
-
153
-
154
- # Left for backward compatibility
155
- class Sim(Distance):
133
+ class Sim(PandasOperand):
156
134
  name = "sim"
157
135
  is_binary = True
158
136
  output_type = "float"
@@ -160,71 +138,4 @@ class Sim(Distance):
160
138
  has_symmetry_importance = True
161
139
 
162
140
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
163
- return 1 - super().calculate_binary(left, right)
164
-
165
-
166
- class StringSim(PandasOperand, abc.ABC):
167
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
- sims = []
169
- for i in left.index:
170
- left_i = self._prepare_value(left.get(i))
171
- right_i = self._prepare_value(right.get(i))
172
- if left_i is not None and right_i is not None:
173
- sims.append(self._similarity(left_i, right_i))
174
- else:
175
- sims.append(None)
176
-
177
- return pd.Series(sims, index=left.index)
178
-
179
- @abc.abstractmethod
180
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
181
- pass
182
-
183
- @abc.abstractmethod
184
- def _similarity(self, left: str, right: str) -> float:
185
- pass
186
-
187
-
188
- class JaroWinklerSim1(StringSim):
189
- name = "sim_jw1"
190
- is_binary = True
191
- input_type = "string"
192
- output_type = "float"
193
- is_symmetrical = True
194
- has_symmetry_importance = True
195
-
196
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
197
- return value
198
-
199
- def _similarity(self, left: str, right: str) -> float:
200
- return jarowinkler_similarity(left, right)
201
-
202
-
203
- class JaroWinklerSim2(StringSim):
204
- name = "sim_jw2"
205
- is_binary = True
206
- input_type = "string"
207
- output_type = "float"
208
- is_symmetrical = True
209
- has_symmetry_importance = True
210
-
211
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
212
- return value[::-1] if value is not None else None
213
-
214
- def _similarity(self, left: str, right: str) -> float:
215
- return jarowinkler_similarity(left, right)
216
-
217
-
218
- class LevenshteinSim(StringSim):
219
- name = "sim_lv"
220
- is_binary = True
221
- input_type = "string"
222
- output_type = "float"
223
- is_symmetrical = True
224
- has_symmetry_importance = True
225
-
226
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
227
- return value
228
-
229
- def _similarity(self, left: str, right: str) -> float:
230
- return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
141
+ return dot(left, right) / (norm(left) * norm(right))
upgini/autofe/date.py CHANGED
@@ -1,3 +1,4 @@
1
+ import abc
1
2
  from typing import Any, Dict, List, Optional, Union
2
3
 
3
4
  import numpy as np
@@ -159,11 +160,33 @@ class DateListDiffBounded(DateListDiff):
159
160
  return super()._agg(x)
160
161
 
161
162
 
162
- class DatePercentile(PandasOperand):
163
- name = "date_per"
163
+ class DatePercentileBase(PandasOperand, abc.ABC):
164
164
  is_binary = True
165
165
  output_type = "float"
166
166
 
167
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
+ # Assuming that left is a date column, right is a feature column
169
+ left = pd.to_datetime(left, unit=self.date_unit)
170
+
171
+ bounds = self._get_bounds(left)
172
+
173
+ return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
174
+
175
+ @abc.abstractmethod
176
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
177
+ pass
178
+
179
+ def __perc(self, f, bounds):
180
+ hit = np.where(f >= bounds)[0]
181
+ if hit.size > 0:
182
+ return np.max(hit) + 1
183
+ else:
184
+ return np.nan
185
+
186
+
187
+ class DatePercentile(DatePercentileBase):
188
+ name = "date_per"
189
+
167
190
  date_unit: Optional[str] = None
168
191
  zero_month: Optional[int]
169
192
  zero_year: Optional[int]
@@ -190,22 +213,15 @@ class DatePercentile(PandasOperand):
190
213
  elif isinstance(value, str):
191
214
  return value[1:-1].split(", ")
192
215
 
193
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
194
- # Assuming that left is a date column, right is a feature column
195
- left = pd.to_datetime(left, unit=self.date_unit)
196
- months = left.dt.month
197
- years = left.dt.year
216
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
217
+ months = date_col.dt.month
218
+ years = date_col.dt.year
198
219
 
199
220
  month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
200
- bounds = month_diffs.apply(
221
+ return month_diffs.apply(
201
222
  lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
202
223
  )
203
224
 
204
- return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
205
225
 
206
- def __perc(self, f, bounds):
207
- hit = np.where(f >= bounds)[0]
208
- if hit.size > 0:
209
- return np.max(hit) + 1
210
- else:
211
- return np.nan
226
+ class DatePercentileType2(DatePercentileBase):
227
+ name = "date_per_type2"
upgini/autofe/unary.py CHANGED
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
-
129
-
130
- class Embeddings(PandasOperand):
131
- name = "emb"
132
- is_unary = True
133
- input_type = "string"
134
- output_type = "vector"
upgini-1.1.296a3521.dev1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.296a3511.dev4
3
+ Version: 1.1.296a3521.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: jarowinkler>=2.0.0
30
- Requires-Dist: levenshtein>=0.25.1
31
29
  Requires-Dist: lightgbm>=3.3.2
32
30
  Requires-Dist: numpy>=1.19.0
33
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
upgini-1.1.296a3521.dev1.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=-FNIwoymXDyCsBSXhzfzKHsj5wYGDAzEzuAYsHXTk_E,34
1
+ upgini/__about__.py,sha256=Bw_daqwDyDp4zgY8gHVi9qMRi1mWRmyMwv7UETDUuIE,34
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
@@ -14,13 +14,13 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=-6gzp6nWBfzKmRRSvyMYUkubH7IwxL3Lrh9wVD85Baw,2457
18
- upgini/autofe/binary.py,sha256=ml0MszLARZqp3UGUqTGsVjT4DD69zTisfBBEqbZ7klU,6767
19
- upgini/autofe/date.py,sha256=qzk0NT332Q0vR1eRwTuNiMSrGE3ulh6Ic3QLBZqSdvw,7284
17
+ upgini/autofe/all_operands.py,sha256=WJOiNVNVyrWc3vYGMvJtrpGRrLEQ237YMTsWnkUbLNw,2502
18
+ upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
19
+ upgini/autofe/date.py,sha256=UHJLyTKYCKwu29J_wsT-wQjPMlPLw1M69TpWVKZPVIM,7637
20
20
  upgini/autofe/feature.py,sha256=_V9B74B3ue7eAYXSOt9JKhVC9klkAKks22MwnBRye_w,12487
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
22
  upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
23
+ upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
24
24
  upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=6paupnciqagACnSzjGSkA2a5i-c9ETvZheLqBYOJemk,17810
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.296a3511.dev4.dist-info/METADATA,sha256=AtvwqLGdDdGvGYETXs7A6olbBCE-p7YsE82OeNcSq2g,48196
61
- upgini-1.1.296a3511.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
- upgini-1.1.296a3511.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.296a3511.dev4.dist-info/RECORD,,
60
+ upgini-1.1.296a3521.dev1.dist-info/METADATA,sha256=CoAnME2f-0ZJf3tlkAQyyc0oUaDu26veU3LbeyTroQg,48127
61
+ upgini-1.1.296a3521.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.1.296a3521.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.296a3521.dev1.dist-info/RECORD,,