upgini 1.2.71a3810.dev2__py3-none-any.whl → 1.2.71a3810.dev4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3810.dev2"
1
+ __version__ = "1.2.71a3810.dev4"
upgini/autofe/date.py CHANGED
@@ -187,16 +187,21 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
187
187
  class DateListDiffBounded(DateListDiff, ParametrizedOperator):
188
188
  lower_bound: Optional[int] = None
189
189
  upper_bound: Optional[int] = None
190
+ normalize: Optional[bool] = None
190
191
 
191
192
  def to_formula(self) -> str:
192
193
  lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
193
194
  upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
194
- return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
195
+ norm = "_norm" if self.normalize else ""
196
+ return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}{norm}"
195
197
 
196
198
  @classmethod
197
199
  def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
198
200
  import re
199
201
 
202
+ normalize = formula.endswith("_norm")
203
+ formula = formula.replace("_norm", "")
204
+
200
205
  pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
201
206
  match = re.match(pattern, formula)
202
207
 
@@ -207,8 +212,13 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
207
212
  lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
208
213
  upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
209
214
  aggregation = match.group(6)
210
-
211
- return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
215
+ return cls(
216
+ diff_unit=diff_unit,
217
+ lower_bound=lower_bound,
218
+ upper_bound=upper_bound,
219
+ aggregation=aggregation,
220
+ normalize=normalize,
221
+ )
212
222
 
213
223
  def get_params(self) -> Dict[str, Optional[str]]:
214
224
  res = super().get_params()
@@ -216,14 +226,20 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
216
226
  res["lower_bound"] = str(self.lower_bound)
217
227
  if self.upper_bound is not None:
218
228
  res["upper_bound"] = str(self.upper_bound)
229
+ if self.normalize is not None:
230
+ res["normalize"] = str(self.normalize)
219
231
  return res
220
232
 
221
233
  def _agg(self, x):
234
+ orig_len = len(x)
222
235
  x = x[
223
236
  (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
224
237
  & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
225
238
  ]
226
- return super()._agg(x)
239
+ agg_res = super()._agg(x)
240
+ if self.normalize and orig_len > 0:
241
+ return agg_res / orig_len
242
+ return agg_res
227
243
 
228
244
 
229
245
  class DatePercentileBase(PandasOperator, abc.ABC):
upgini/autofe/unary.py CHANGED
@@ -157,15 +157,15 @@ class Embeddings(PandasOperator):
157
157
  output_type: Optional[str] = "vector"
158
158
 
159
159
 
160
- class BinCat(PandasOperator):
160
+ class Bin(PandasOperator):
161
161
  name: str = "bin"
162
162
  is_unary: bool = True
163
- output_type: Optional[str] = "string"
164
- bin_bounds: List[int]
163
+ output_type: Optional[str] = "category"
164
+ bin_bounds: List[int] = []
165
165
  is_categorical: bool = True
166
166
 
167
167
  def calculate_unary(self, data: pd.Series) -> pd.Series:
168
- return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype(str)
168
+ return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
169
169
 
170
170
  def _bin(self, f, bounds):
171
171
  if f is None or np.isnan(f):
upgini/autofe/vector.py CHANGED
@@ -22,3 +22,12 @@ class Sum(PandasOperator, VectorizableMixin):
22
22
 
23
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
25
+
26
+
27
+ class Vectorize(PandasOperator, VectorizableMixin):
28
+ name: str = "vectorize"
29
+ is_vector: bool = True
30
+ group_index: int = 0
31
+
32
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
33
+ return pd.DataFrame(data).T.apply(lambda x: x.to_list(), axis=1)
{upgini-1.2.71a3810.dev2.dist-info → upgini-1.2.71a3810.dev4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3810.dev2
3
+ Version: 1.2.71a3810.dev4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
{upgini-1.2.71a3810.dev2.dist-info → upgini-1.2.71a3810.dev4.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=MTGUBBTe5h0uDXYCCEi_Ls0ph00v8U1H8Ryg234maxU,33
1
+ upgini/__about__.py,sha256=LP81_wgdiIYkisJXGLW7oX7fcgEPYOBkpyITBahIEVo,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
@@ -16,13 +16,13 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
18
18
  upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
19
- upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
19
+ upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
20
20
  upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
21
21
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
22
22
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
23
- upgini/autofe/unary.py,sha256=_4F3ZyuPUz2nbkJFMJi2Dk5FirGZngUammstgK1Fq34,5720
23
+ upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
24
24
  upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
25
- upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
25
+ upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
26
26
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
27
27
  upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
28
28
  upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.71a3810.dev2.dist-info/METADATA,sha256=KShCDNaZiUeH7OC7TETgJwx-UCZ9QWlaMcML-eZPJGY,49075
75
- upgini-1.2.71a3810.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.71a3810.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.71a3810.dev2.dist-info/RECORD,,
74
+ upgini-1.2.71a3810.dev4.dist-info/METADATA,sha256=N2b-C2Z-kt5bPFhG-XK-IpHNHIce6PLNAL_VNtixQ_s,49075
75
+ upgini-1.2.71a3810.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
+ upgini-1.2.71a3810.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.71a3810.dev4.dist-info/RECORD,,