upgini 1.2.57a2__tar.gz → 1.2.57a3675.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/PKG-INFO +1 -1
- upgini-1.2.57a3675.dev4/src/upgini/__about__.py +1 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/date.py +8 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/feature.py +1 -10
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/vector.py +82 -5
- upgini-1.2.57a2/src/upgini/__about__.py +0 -1
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/.gitignore +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/LICENSE +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/README.md +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/pyproject.toml +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/ads.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/dataset.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/errors.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/features_enricher.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/http.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/metadata.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/metrics.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/search_task.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/spinner.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.57a3675.dev4"
|
|
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
64
|
return res
|
|
65
65
|
|
|
66
66
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
67
|
+
if left.isna().all() or right.isna().all():
|
|
68
|
+
return pd.Series([None] * len(left))
|
|
69
|
+
|
|
67
70
|
left = self._convert_to_date(left, self.left_unit)
|
|
68
71
|
right = self._convert_to_date(right, self.right_unit)
|
|
69
72
|
diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
|
|
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
142
145
|
return cls(aggregation=aggregation)
|
|
143
146
|
|
|
144
147
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
148
|
+
if left.isna().all() or right.isna().all():
|
|
149
|
+
return pd.Series([None] * len(left), dtype=np.float64)
|
|
150
|
+
|
|
145
151
|
left = self._convert_to_date(left, self.left_unit)
|
|
146
152
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
147
153
|
mask = left.notna() & right.notna() & right_mask
|
|
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
230
236
|
pass
|
|
231
237
|
|
|
232
238
|
def _perc(self, f, bounds):
|
|
239
|
+
if f is None or np.isnan(f):
|
|
240
|
+
return np.nan
|
|
233
241
|
hit = np.where(f >= np.array(bounds))[0]
|
|
234
242
|
if hit.size > 0:
|
|
235
243
|
return np.max(hit) + 1
|
|
@@ -26,18 +26,9 @@ class Column:
|
|
|
26
26
|
return dict()
|
|
27
27
|
|
|
28
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
29
|
-
self.name =
|
|
29
|
+
self.name = mapping.get(self.name) or self.name
|
|
30
30
|
return self
|
|
31
31
|
|
|
32
|
-
def _unhash(self, feature_name: str) -> str:
|
|
33
|
-
last_component_idx = feature_name.rfind("_")
|
|
34
|
-
if not feature_name.startswith("f_"):
|
|
35
|
-
return feature_name # etalon feature
|
|
36
|
-
elif last_component_idx == 1:
|
|
37
|
-
return feature_name[2:] # fully hashed name, cannot unhash
|
|
38
|
-
else:
|
|
39
|
-
return feature_name[2:last_component_idx]
|
|
40
|
-
|
|
41
32
|
def delete_data(self):
|
|
42
33
|
self.data = None
|
|
43
34
|
|
|
@@ -2,7 +2,11 @@ import abc
|
|
|
2
2
|
from typing import Dict, List, Optional
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from pydantic import field_validator as validator # V2
|
|
8
|
+
except ImportError:
|
|
9
|
+
from pydantic import validator # V1
|
|
6
10
|
|
|
7
11
|
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
8
12
|
|
|
@@ -29,12 +33,16 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
29
33
|
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
30
34
|
is_vector: bool = True
|
|
31
35
|
date_unit: Optional[str] = None
|
|
36
|
+
offset_size: int = 0
|
|
37
|
+
offset_unit: str = "D"
|
|
32
38
|
|
|
33
39
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
34
40
|
res = super().get_params()
|
|
35
41
|
res.update(
|
|
36
42
|
{
|
|
37
43
|
"date_unit": self.date_unit,
|
|
44
|
+
"offset_size": self.offset_size,
|
|
45
|
+
"offset_unit": self.offset_unit,
|
|
38
46
|
}
|
|
39
47
|
)
|
|
40
48
|
return res
|
|
@@ -46,13 +54,31 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
46
54
|
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
47
55
|
ts.set_index(date.name, inplace=True)
|
|
48
56
|
ts = ts[ts.index.notna()].sort_index()
|
|
49
|
-
ts =
|
|
57
|
+
ts = (
|
|
58
|
+
ts.groupby([c.name for c in data[1:-1]])
|
|
59
|
+
.apply(self._shift)[data[-1].name]
|
|
60
|
+
.to_frame()
|
|
61
|
+
.reset_index()
|
|
62
|
+
.set_index(date.name)
|
|
63
|
+
.groupby([c.name for c in data[1:-1]])
|
|
64
|
+
if len(data) > 2
|
|
65
|
+
else self._shift(ts)
|
|
66
|
+
)
|
|
50
67
|
ts = self._aggregate(ts)
|
|
51
68
|
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
52
69
|
ts.index = date.index
|
|
53
70
|
|
|
54
71
|
return ts.iloc[:, -1]
|
|
55
72
|
|
|
73
|
+
def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
74
|
+
if self.offset_size > 0:
|
|
75
|
+
return ts.iloc[:, :-1].merge(
|
|
76
|
+
ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
|
|
77
|
+
left_index=True,
|
|
78
|
+
right_index=True,
|
|
79
|
+
)
|
|
80
|
+
return ts
|
|
81
|
+
|
|
56
82
|
@abc.abstractmethod
|
|
57
83
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
58
84
|
pass
|
|
@@ -67,6 +93,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
67
93
|
window_unit: str = "D"
|
|
68
94
|
|
|
69
95
|
@validator("window_unit")
|
|
96
|
+
@classmethod
|
|
70
97
|
def validate_window_unit(cls, v: str) -> str:
|
|
71
98
|
try:
|
|
72
99
|
pd.tseries.frequencies.to_offset(v)
|
|
@@ -77,12 +104,35 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
77
104
|
)
|
|
78
105
|
|
|
79
106
|
def to_formula(self) -> str:
|
|
80
|
-
|
|
107
|
+
roll_component = f"roll_{self.window_size}{self.window_unit}"
|
|
108
|
+
if self.offset_size > 0:
|
|
109
|
+
roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
110
|
+
return f"{roll_component}_{self.aggregation}"
|
|
81
111
|
|
|
82
112
|
@classmethod
|
|
83
113
|
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
84
114
|
import re
|
|
85
115
|
|
|
116
|
+
# Try matching pattern with offset first
|
|
117
|
+
pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
|
|
118
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
119
|
+
|
|
120
|
+
if match_with_offset:
|
|
121
|
+
window_size = int(match_with_offset.group(1))
|
|
122
|
+
window_unit = match_with_offset.group(2)
|
|
123
|
+
offset_size = int(match_with_offset.group(3))
|
|
124
|
+
offset_unit = match_with_offset.group(4)
|
|
125
|
+
aggregation = match_with_offset.group(5)
|
|
126
|
+
|
|
127
|
+
return cls(
|
|
128
|
+
window_size=window_size,
|
|
129
|
+
window_unit=window_unit,
|
|
130
|
+
offset_size=offset_size,
|
|
131
|
+
offset_unit=offset_unit,
|
|
132
|
+
aggregation=aggregation,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# If no offset pattern found, try basic pattern
|
|
86
136
|
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
87
137
|
match = re.match(pattern, formula)
|
|
88
138
|
|
|
@@ -107,7 +157,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
|
107
157
|
return res
|
|
108
158
|
|
|
109
159
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
110
|
-
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=
|
|
160
|
+
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
|
|
111
161
|
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
112
162
|
)
|
|
113
163
|
|
|
@@ -117,12 +167,33 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
|
117
167
|
lag_unit: str = "D"
|
|
118
168
|
|
|
119
169
|
def to_formula(self) -> str:
|
|
120
|
-
|
|
170
|
+
lag_component = f"lag_{self.lag_size}{self.lag_unit}"
|
|
171
|
+
if self.offset_size > 0:
|
|
172
|
+
lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
|
|
173
|
+
return lag_component
|
|
121
174
|
|
|
122
175
|
@classmethod
|
|
123
176
|
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
124
177
|
import re
|
|
125
178
|
|
|
179
|
+
# Try matching pattern with offset first
|
|
180
|
+
pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
|
|
181
|
+
match_with_offset = re.match(pattern_with_offset, formula)
|
|
182
|
+
|
|
183
|
+
if match_with_offset:
|
|
184
|
+
lag_size = int(match_with_offset.group(1))
|
|
185
|
+
lag_unit = match_with_offset.group(2)
|
|
186
|
+
offset_size = int(match_with_offset.group(3))
|
|
187
|
+
offset_unit = match_with_offset.group(4)
|
|
188
|
+
|
|
189
|
+
return cls(
|
|
190
|
+
lag_size=lag_size,
|
|
191
|
+
lag_unit=lag_unit,
|
|
192
|
+
offset_size=offset_size,
|
|
193
|
+
offset_unit=offset_unit,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# If no offset pattern found, try basic pattern
|
|
126
197
|
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
127
198
|
match = re.match(pattern, formula)
|
|
128
199
|
|
|
@@ -136,6 +207,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
|
136
207
|
|
|
137
208
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
138
209
|
res = super().get_params()
|
|
210
|
+
res.update(
|
|
211
|
+
{
|
|
212
|
+
"lag_size": self.lag_size,
|
|
213
|
+
"lag_unit": self.lag_unit,
|
|
214
|
+
}
|
|
215
|
+
)
|
|
139
216
|
return res
|
|
140
217
|
|
|
141
218
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.57a2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|