upgini 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3810.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +20 -4
- upgini/autofe/feature.py +9 -7
- upgini/autofe/unary.py +2 -2
- upgini/autofe/vector.py +9 -0
- upgini/metrics.py +6 -0
- {upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3810.dev5.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3810.dev5.dist-info}/RECORD +10 -10
- {upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3810.dev5.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3810.dev5.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.71a3810.dev3"
|
|
1
|
+
__version__ = "1.2.71a3810.dev5"
|
upgini/autofe/date.py
CHANGED
|
@@ -187,16 +187,21 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
|
|
|
187
187
|
class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
188
188
|
lower_bound: Optional[int] = None
|
|
189
189
|
upper_bound: Optional[int] = None
|
|
190
|
+
normalize: Optional[bool] = None
|
|
190
191
|
|
|
191
192
|
def to_formula(self) -> str:
|
|
192
193
|
lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
|
|
193
194
|
upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
|
|
194
|
-
|
|
195
|
+
norm = "_norm" if self.normalize else ""
|
|
196
|
+
return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}{norm}"
|
|
195
197
|
|
|
196
198
|
@classmethod
|
|
197
199
|
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
|
|
198
200
|
import re
|
|
199
201
|
|
|
202
|
+
normalize = formula.endswith("_norm")
|
|
203
|
+
formula = formula.replace("_norm", "")
|
|
204
|
+
|
|
200
205
|
pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
|
|
201
206
|
match = re.match(pattern, formula)
|
|
202
207
|
|
|
@@ -207,8 +212,13 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
|
207
212
|
lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
|
|
208
213
|
upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
|
|
209
214
|
aggregation = match.group(6)
|
|
210
|
-
|
|
211
|
-
|
|
215
|
+
return cls(
|
|
216
|
+
diff_unit=diff_unit,
|
|
217
|
+
lower_bound=lower_bound,
|
|
218
|
+
upper_bound=upper_bound,
|
|
219
|
+
aggregation=aggregation,
|
|
220
|
+
normalize=normalize,
|
|
221
|
+
)
|
|
212
222
|
|
|
213
223
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
214
224
|
res = super().get_params()
|
|
@@ -216,14 +226,20 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
|
216
226
|
res["lower_bound"] = str(self.lower_bound)
|
|
217
227
|
if self.upper_bound is not None:
|
|
218
228
|
res["upper_bound"] = str(self.upper_bound)
|
|
229
|
+
if self.normalize is not None:
|
|
230
|
+
res["normalize"] = str(self.normalize)
|
|
219
231
|
return res
|
|
220
232
|
|
|
221
233
|
def _agg(self, x):
|
|
234
|
+
orig_len = len(x)
|
|
222
235
|
x = x[
|
|
223
236
|
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
224
237
|
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
225
238
|
]
|
|
226
|
-
|
|
239
|
+
agg_res = super()._agg(x)
|
|
240
|
+
if self.normalize and orig_len > 0:
|
|
241
|
+
return agg_res / orig_len
|
|
242
|
+
return agg_res
|
|
227
243
|
|
|
228
244
|
|
|
229
245
|
class DatePercentileBase(PandasOperator, abc.ABC):
|
upgini/autofe/feature.py
CHANGED
|
@@ -162,16 +162,18 @@ class Feature:
|
|
|
162
162
|
return self.cached_display_name
|
|
163
163
|
|
|
164
164
|
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
165
|
-
|
|
165
|
+
components = []
|
|
166
166
|
|
|
167
167
|
if self.alias:
|
|
168
|
-
components
|
|
169
|
-
elif
|
|
170
|
-
components
|
|
171
|
-
|
|
172
|
-
components = (
|
|
173
|
-
["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
|
|
168
|
+
components.extend(["f_autofe", self.alias])
|
|
169
|
+
elif should_stack_op:
|
|
170
|
+
components.extend(
|
|
171
|
+
[self.children[0].get_display_name(cache=cache, shorten=shorten, **kwargs), self.get_op_display_name()]
|
|
174
172
|
)
|
|
173
|
+
elif shorten:
|
|
174
|
+
components.extend(["f_autofe", self.get_op_display_name()])
|
|
175
|
+
else:
|
|
176
|
+
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe", self.get_op_display_name()]
|
|
175
177
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
176
178
|
display_name = "_".join(components)
|
|
177
179
|
|
upgini/autofe/unary.py
CHANGED
|
@@ -160,12 +160,12 @@ class Embeddings(PandasOperator):
|
|
|
160
160
|
class Bin(PandasOperator):
|
|
161
161
|
name: str = "bin"
|
|
162
162
|
is_unary: bool = True
|
|
163
|
-
output_type: Optional[str] = "
|
|
163
|
+
output_type: Optional[str] = "category"
|
|
164
164
|
bin_bounds: List[int] = []
|
|
165
165
|
is_categorical: bool = True
|
|
166
166
|
|
|
167
167
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
168
|
-
return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype(
|
|
168
|
+
return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
|
|
169
169
|
|
|
170
170
|
def _bin(self, f, bounds):
|
|
171
171
|
if f is None or np.isnan(f):
|
upgini/autofe/vector.py
CHANGED
|
@@ -22,3 +22,12 @@ class Sum(PandasOperator, VectorizableMixin):
|
|
|
22
22
|
|
|
23
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Vectorize(PandasOperator, VectorizableMixin):
|
|
28
|
+
name: str = "vectorize"
|
|
29
|
+
is_vector: bool = True
|
|
30
|
+
group_index: int = 0
|
|
31
|
+
|
|
32
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
33
|
+
return pd.DataFrame(data).T.apply(lambda x: x.to_list(), axis=1)
|
upgini/metrics.py
CHANGED
|
@@ -697,6 +697,9 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
697
697
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
698
698
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
699
699
|
self.cat_features = _get_cat_features(x)
|
|
700
|
+
print("prepare to fit")
|
|
701
|
+
print(x.dtypes.to_dict())
|
|
702
|
+
print(self.cat_features)
|
|
700
703
|
x = fill_na_cat_features(x, self.cat_features)
|
|
701
704
|
for feature in self.cat_features:
|
|
702
705
|
x[feature] = x[feature].astype("category").cat.codes
|
|
@@ -707,6 +710,9 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
707
710
|
|
|
708
711
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
709
712
|
x, y, params = super()._prepare_to_calculate(x, y)
|
|
713
|
+
print("prepare to calculate")
|
|
714
|
+
print(x.dtypes.to_dict())
|
|
715
|
+
print(self.cat_features)
|
|
710
716
|
if self.cat_features is not None:
|
|
711
717
|
x = fill_na_cat_features(x, self.cat_features)
|
|
712
718
|
for feature in self.cat_features:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.71a3810.dev3
|
|
3
|
+
Version: 1.2.71a3810.dev5
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=QR5uw20nhIxDI34CnHAuXHBaXYnOBpyCMHIjl0vktNQ,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
|
|
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=KqDQ29sU1Aty5Z40DDqO869Y_CClQfmU58nE9rScxRc,2
|
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=KqSoT-TOnSpYGEY3ZC7Hq8YrYdxNXbjtyorCAk86MzU,35681
|
|
11
11
|
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
@@ -16,13 +16,13 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
|
18
18
|
upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
|
|
20
|
+
upgini/autofe/feature.py,sha256=md43NwDof0s_nWn_WfOO0l2wYItQ416nEzHm5u29XOA,14945
|
|
21
21
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
|
22
22
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
|
|
24
24
|
upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
|
|
25
|
-
upgini/autofe/vector.py,sha256=
|
|
25
|
+
upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
|
|
26
26
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
|
27
27
|
upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
|
|
28
28
|
upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
|
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
|
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
74
|
-
upgini-1.2.71a3810.
|
|
75
|
-
upgini-1.2.71a3810.
|
|
76
|
-
upgini-1.2.71a3810.
|
|
77
|
-
upgini-1.2.71a3810.
|
|
74
|
+
upgini-1.2.71a3810.dev5.dist-info/METADATA,sha256=F7wqhbZWwxUebgf0hxfovlBTpUh9-mz4d5LX8TcJP5Q,49075
|
|
75
|
+
upgini-1.2.71a3810.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
76
|
+
upgini-1.2.71a3810.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
77
|
+
upgini-1.2.71a3810.dev5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|