upgini 1.2.56a3675.dev2__py3-none-any.whl → 1.2.56a3818.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +0 -8
- upgini/autofe/feature.py +1 -1
- upgini/autofe/vector.py +82 -5
- upgini/features_enricher.py +1 -1
- {upgini-1.2.56a3675.dev2.dist-info → upgini-1.2.56a3818.dev2.dist-info}/METADATA +1 -1
- {upgini-1.2.56a3675.dev2.dist-info → upgini-1.2.56a3818.dev2.dist-info}/RECORD +9 -9
- {upgini-1.2.56a3675.dev2.dist-info → upgini-1.2.56a3818.dev2.dist-info}/WHEEL +0 -0
- {upgini-1.2.56a3675.dev2.dist-info → upgini-1.2.56a3818.dev2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.56a3675.dev2"
|
|
1
|
+
__version__ = "1.2.56a3818.dev2"
|
upgini/autofe/date.py
CHANGED
|
@@ -64,9 +64,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
64
|
return res
|
|
65
65
|
|
|
66
66
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
67
|
-
if left.isna().all() or right.isna().all():
|
|
68
|
-
return pd.Series([None] * len(left))
|
|
69
|
-
|
|
70
67
|
left = self._convert_to_date(left, self.left_unit)
|
|
71
68
|
right = self._convert_to_date(right, self.right_unit)
|
|
72
69
|
diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
|
|
@@ -145,9 +142,6 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
145
142
|
return cls(aggregation=aggregation)
|
|
146
143
|
|
|
147
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
148
|
-
if left.isna().all() or right.isna().all():
|
|
149
|
-
return pd.Series([None] * len(left), dtype=np.float64)
|
|
150
|
-
|
|
151
145
|
left = self._convert_to_date(left, self.left_unit)
|
|
152
146
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
153
147
|
mask = left.notna() & right.notna() & right_mask
|
|
@@ -236,8 +230,6 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
236
230
|
pass
|
|
237
231
|
|
|
238
232
|
def _perc(self, f, bounds):
|
|
239
|
-
if f is None or np.isnan(f):
|
|
240
|
-
return np.nan
|
|
241
233
|
hit = np.where(f >= np.array(bounds))[0]
|
|
242
234
|
if hit.size > 0:
|
|
243
235
|
return np.max(hit) + 1
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/vector.py
CHANGED
|
@@ -2,7 +2,11 @@ import abc
|
|
|
2
2
|
from typing import Dict, List, Optional
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from pydantic import field_validator as validator # V2
|
|
8
|
+
except ImportError:
|
|
9
|
+
from pydantic import validator # V1
|
|
6
10
|
|
|
7
11
|
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
8
12
|
|
|
@@ -29,12 +33,16 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
29
33
|
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
30
34
|
is_vector: bool = True
|
|
31
35
|
date_unit: Optional[str] = None
|
|
36
|
+
offset_size: int = 0
|
|
37
|
+
offset_unit: str = "D"
|
|
32
38
|
|
|
33
39
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
34
40
|
res = super().get_params()
|
|
35
41
|
res.update(
|
|
36
42
|
{
|
|
37
43
|
"date_unit": self.date_unit,
|
|
44
|
+
"offset_size": self.offset_size,
|
|
45
|
+
"offset_unit": self.offset_unit,
|
|
38
46
|
}
|
|
39
47
|
)
|
|
40
48
|
return res
|
|
@@ -46,13 +54,31 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
46
54
|
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
47
55
|
ts.set_index(date.name, inplace=True)
|
|
48
56
|
ts = ts[ts.index.notna()].sort_index()
|
|
49
|
-
ts =
|
|
57
|
+
ts = (
|
|
58
|
+
ts.groupby([c.name for c in data[1:-1]])
|
|
59
|
+
.apply(self._shift)[data[-1].name]
|
|
60
|
+
.to_frame()
|
|
61
|
+
.reset_index()
|
|
62
|
+
.set_index(date.name)
|
|
63
|
+
.groupby([c.name for c in data[1:-1]])
|
|
64
|
+
if len(data) > 2
|
|
65
|
+
else self._shift(ts)
|
|
66
|
+
)
|
|
50
67
|
ts = self._aggregate(ts)
|
|
51
68
|
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
52
69
|
ts.index = date.index
|
|
53
70
|
|
|
54
71
|
return ts.iloc[:, -1]
|
|
55
72
|
|
|
73
|
+
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
74
|
+
if self.offset_size > 0:
|
|
75
|
+
return ts.iloc[:, :-1].merge(
|
|
76
|
+
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
77
|
+
left_index=True,
|
|
78
|
+
right_index=True,
|
|
79
|
+
)
|
|
80
|
+
return ts
|
|
81
|
+
|
|
56
82
|
@abc.abstractmethod
|
|
57
83
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
58
84
|
pass
|
|
@@ -67,6 +93,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
67
93
|
window_unit: str = "D"
|
|
68
94
|
|
|
69
95
|
@validator("window_unit")
|
|
96
|
+
@classmethod
|
|
70
97
|
def validate_window_unit(cls, v: str) -> str:
|
|
71
98
|
try:
|
|
72
99
|
pd.tseries.frequencies.to_offset(v)
|
|
@@ -77,12 +104,35 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
77
104
|
)
|
|
78
105
|
|
|
79
106
|
def to_formula(self) -> str:
|
|
80
|
-
|
|
107
|
+
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
108
|
+
if self.offset_size > 0:
|
|
109
|
+
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
110
|
+
return f"{roll_component}_{self.aggregation}"
|
|
81
111
|
|
|
82
112
|
@classmethod
|
|
83
113
|
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
84
114
|
import re
|
|
85
115
|
|
|
116
|
+
# Try matching pattern with offset first
|
|
117
|
+
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
118
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
119
|
+
|
|
120
|
+
if match_with_offset:
|
|
121
|
+
window_size = int(match_with_offset.group(1))
|
|
122
|
+
window_unit = match_with_offset.group(2)
|
|
123
|
+
offset_size = int(match_with_offset.group(3))
|
|
124
|
+
offset_unit = match_with_offset.group(4)
|
|
125
|
+
aggregation = match_with_offset.group(5)
|
|
126
|
+
|
|
127
|
+
return cls(
|
|
128
|
+
window_size=window_size,
|
|
129
|
+
window_unit=window_unit,
|
|
130
|
+
offset_size=offset_size,
|
|
131
|
+
offset_unit=offset_unit,
|
|
132
|
+
aggregation=aggregation,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# If no offset pattern found, try basic pattern
|
|
86
136
|
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
87
137
|
match = re.match(pattern, formula)
|
|
88
138
|
|
|
@@ -107,7 +157,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
107
157
|
return res
|
|
108
158
|
|
|
109
159
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
110
|
-
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=
|
|
160
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
111
161
|
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
112
162
|
)
|
|
113
163
|
|
|
@@ -117,12 +167,33 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
|
117
167
|
lag_unit: str = "D"
|
|
118
168
|
|
|
119
169
|
def to_formula(self) -> str:
|
|
120
|
-
|
|
170
|
+
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
171
|
+
if self.offset_size > 0:
|
|
172
|
+
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
173
|
+
return lag_component
|
|
121
174
|
|
|
122
175
|
@classmethod
|
|
123
176
|
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
124
177
|
import re
|
|
125
178
|
|
|
179
|
+
# Try matching pattern with offset first
|
|
180
|
+
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
181
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
182
|
+
|
|
183
|
+
if match_with_offset:
|
|
184
|
+
lag_size = int(match_with_offset.group(1))
|
|
185
|
+
lag_unit = match_with_offset.group(2)
|
|
186
|
+
offset_size = int(match_with_offset.group(3))
|
|
187
|
+
offset_unit = match_with_offset.group(4)
|
|
188
|
+
|
|
189
|
+
return cls(
|
|
190
|
+
lag_size=lag_size,
|
|
191
|
+
lag_unit=lag_unit,
|
|
192
|
+
offset_size=offset_size,
|
|
193
|
+
offset_unit=offset_unit,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# If no offset pattern found, try basic pattern
|
|
126
197
|
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
127
198
|
match = re.match(pattern, formula)
|
|
128
199
|
|
|
@@ -136,6 +207,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
|
136
207
|
|
|
137
208
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
138
209
|
res = super().get_params()
|
|
210
|
+
res.update(
|
|
211
|
+
{
|
|
212
|
+
"lag_size": self.lag_size,
|
|
213
|
+
"lag_unit": self.lag_unit,
|
|
214
|
+
}
|
|
215
|
+
)
|
|
139
216
|
return res
|
|
140
217
|
|
|
141
218
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
upgini/features_enricher.py
CHANGED
|
@@ -3733,7 +3733,7 @@ if response.status_code == 200:
|
|
|
3733
3733
|
features_info_without_links = []
|
|
3734
3734
|
internal_features_info = []
|
|
3735
3735
|
|
|
3736
|
-
original_shaps = {fm.name: fm.shap_value for fm in features_meta}
|
|
3736
|
+
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
|
3737
3737
|
|
|
3738
3738
|
if updated_shaps is not None:
|
|
3739
3739
|
for fm in features_meta:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.56a3675.dev2
|
|
3
|
+
Version: 1.2.56a3818.dev2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=B35mYNQAFpDko1Bk1FrsuvEXXmEaDk9hG_5GrMTV4IA,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=FkAKQV_XOXTobwOXpdy9BPfRkL4fkgoNa2B6NniiCrs,201554
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -16,12 +16,12 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=d-sijAD7dETfqIOCaZh1vhuVjsS_nqa-6dhjwkCdny4,10441
|
|
20
|
+
upgini/autofe/feature.py,sha256=WFob1r-E5s1ano_ogzUZ9xnMjTBN26wGv9lcOX8XghI,14763
|
|
21
21
|
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
22
|
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
24
|
+
upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.56a3675.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
-
upgini-1.2.56a3675.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
-
upgini-1.2.56a3675.dev2.dist-info/RECORD,,
|
|
62
|
+
upgini-1.2.56a3818.dev2.dist-info/METADATA,sha256=V0b7BNTjV7HPOrPO34dKgjBeqORu5DkE62JZN-ub6gQ,49065
|
|
63
|
+
upgini-1.2.56a3818.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.56a3818.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.56a3818.dev2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|