upgini 1.2.20a3657.dev1__py3-none-any.whl → 1.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +0 -1
- upgini/autofe/all_operands.py +84 -2
- upgini/autofe/date.py +6 -33
- upgini/autofe/operand.py +1 -47
- upgini/autofe/vector.py +2 -133
- upgini/features_enricher.py +27 -11
- {upgini-1.2.20a3657.dev1.dist-info → upgini-1.2.22.dist-info}/METADATA +1 -1
- {upgini-1.2.20a3657.dev1.dist-info → upgini-1.2.22.dist-info}/RECORD +11 -11
- {upgini-1.2.20a3657.dev1.dist-info → upgini-1.2.22.dist-info}/WHEEL +1 -1
- {upgini-1.2.20a3657.dev1.dist-info → upgini-1.2.22.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.20a3657.dev1"
|
|
1
|
+
__version__ = "1.2.22"
|
upgini/__init__.py
CHANGED
|
@@ -2,7 +2,6 @@ import os
|
|
|
2
2
|
|
|
3
3
|
from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
4
4
|
from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
5
|
-
|
|
6
5
|
# from .lazy_import import LazyImport
|
|
7
6
|
|
|
8
7
|
os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,5 +1,87 @@
|
|
|
1
|
-
from
|
|
1
|
+
from copy import deepcopy
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from upgini.autofe.binary import (
|
|
5
|
+
Add,
|
|
6
|
+
Combine,
|
|
7
|
+
CombineThenFreq,
|
|
8
|
+
Distance,
|
|
9
|
+
Divide,
|
|
10
|
+
JaroWinklerSim1,
|
|
11
|
+
JaroWinklerSim2,
|
|
12
|
+
LevenshteinSim,
|
|
13
|
+
Max,
|
|
14
|
+
Min,
|
|
15
|
+
Multiply,
|
|
16
|
+
Sim,
|
|
17
|
+
Subtract,
|
|
18
|
+
)
|
|
19
|
+
from upgini.autofe.date import (
|
|
20
|
+
DateDiff,
|
|
21
|
+
DateDiffType2,
|
|
22
|
+
DateListDiff,
|
|
23
|
+
DateListDiffBounded,
|
|
24
|
+
DatePercentile,
|
|
25
|
+
DatePercentileMethod2,
|
|
26
|
+
)
|
|
27
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
28
|
+
from upgini.autofe.operand import Operand
|
|
29
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
30
|
+
from upgini.autofe.vector import Mean, Sum
|
|
31
|
+
|
|
32
|
+
ALL_OPERANDS: Dict[str, Operand] = {
|
|
33
|
+
op.name: op
|
|
34
|
+
for op in [
|
|
35
|
+
Freq(),
|
|
36
|
+
Mean(),
|
|
37
|
+
Sum(),
|
|
38
|
+
Abs(),
|
|
39
|
+
Log(),
|
|
40
|
+
Sqrt(),
|
|
41
|
+
Square(),
|
|
42
|
+
Sigmoid(),
|
|
43
|
+
Floor(),
|
|
44
|
+
Residual(),
|
|
45
|
+
Min(),
|
|
46
|
+
Max(),
|
|
47
|
+
Add(),
|
|
48
|
+
Subtract(),
|
|
49
|
+
Multiply(),
|
|
50
|
+
Divide(),
|
|
51
|
+
GroupByThenAgg(name="GroupByThenMin", agg="min"),
|
|
52
|
+
GroupByThenAgg(name="GroupByThenMax", agg="max"),
|
|
53
|
+
GroupByThenAgg(name="GroupByThenMean", agg="mean"),
|
|
54
|
+
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
55
|
+
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
56
|
+
GroupByThenRank(),
|
|
57
|
+
Combine(),
|
|
58
|
+
CombineThenFreq(),
|
|
59
|
+
GroupByThenNUnique(),
|
|
60
|
+
GroupByThenFreq(),
|
|
61
|
+
Sim(),
|
|
62
|
+
DateDiff(),
|
|
63
|
+
DateDiffType2(),
|
|
64
|
+
DateListDiff(aggregation="min"),
|
|
65
|
+
DateListDiff(aggregation="max"),
|
|
66
|
+
DateListDiff(aggregation="mean"),
|
|
67
|
+
DateListDiff(aggregation="nunique"),
|
|
68
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
69
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
70
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
71
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
72
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
73
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
74
|
+
DatePercentile(),
|
|
75
|
+
DatePercentileMethod2(),
|
|
76
|
+
Norm(),
|
|
77
|
+
JaroWinklerSim1(),
|
|
78
|
+
JaroWinklerSim2(),
|
|
79
|
+
LevenshteinSim(),
|
|
80
|
+
Distance(),
|
|
81
|
+
Embeddings(),
|
|
82
|
+
]
|
|
83
|
+
}
|
|
2
84
|
|
|
3
85
|
|
|
4
86
|
def find_op(name):
|
|
5
|
-
return
|
|
87
|
+
return deepcopy(ALL_OPERANDS.get(name))
|
upgini/autofe/date.py
CHANGED
|
@@ -7,11 +7,11 @@ import pandas as pd
|
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
10
|
+
from upgini.autofe.operand import PandasOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
|
14
|
-
major_version = int(pydantic_version.split(".")[0])
|
|
14
|
+
major_version = int(pydantic_version.split('.')[0])
|
|
15
15
|
return major_version
|
|
16
16
|
|
|
17
17
|
|
|
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
109
109
|
_count_aggregations = ["nunique", "count"]
|
|
110
110
|
|
|
111
111
|
|
|
112
|
-
class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
112
|
+
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
113
113
|
is_binary: bool = True
|
|
114
114
|
has_symmetry_importance: bool = True
|
|
115
115
|
|
|
@@ -134,15 +134,6 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
134
134
|
data["name"] = f"date_diff_{data.get('aggregation')}"
|
|
135
135
|
super().__init__(**data)
|
|
136
136
|
|
|
137
|
-
@classmethod
|
|
138
|
-
def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
|
|
139
|
-
if not formula.startswith("date_diff_"):
|
|
140
|
-
return None
|
|
141
|
-
aggregation = formula.replace("date_diff_", "")
|
|
142
|
-
if "_" in aggregation:
|
|
143
|
-
return None
|
|
144
|
-
return cls(aggregation=aggregation)
|
|
145
|
-
|
|
146
137
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
147
138
|
left = self._convert_to_date(left, self.left_unit)
|
|
148
139
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
@@ -179,7 +170,7 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
179
170
|
return method(x) if len(x) > 0 else default
|
|
180
171
|
|
|
181
172
|
|
|
182
|
-
class DateListDiffBounded(DateListDiff, ParametrizedOperand):
|
|
173
|
+
class DateListDiffBounded(DateListDiff):
|
|
183
174
|
lower_bound: Optional[int] = None
|
|
184
175
|
upper_bound: Optional[int] = None
|
|
185
176
|
|
|
@@ -197,23 +188,6 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
|
|
|
197
188
|
data["name"] = "_".join(components)
|
|
198
189
|
super().__init__(**data)
|
|
199
190
|
|
|
200
|
-
@classmethod
|
|
201
|
-
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
|
|
202
|
-
import re
|
|
203
|
-
|
|
204
|
-
pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
|
|
205
|
-
match = re.match(pattern, formula)
|
|
206
|
-
|
|
207
|
-
if not match:
|
|
208
|
-
return None
|
|
209
|
-
|
|
210
|
-
diff_unit = match.group(1)
|
|
211
|
-
lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
|
|
212
|
-
upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
|
|
213
|
-
aggregation = match.group(6)
|
|
214
|
-
|
|
215
|
-
return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
|
|
216
|
-
|
|
217
191
|
def _agg(self, x):
|
|
218
192
|
x = x[
|
|
219
193
|
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
@@ -283,17 +257,16 @@ class DatePercentile(DatePercentileBase):
|
|
|
283
257
|
# Use @field_validator for Pydantic 2.x
|
|
284
258
|
from pydantic import field_validator
|
|
285
259
|
|
|
286
|
-
@field_validator("zero_bounds", mode="before")
|
|
260
|
+
@field_validator('zero_bounds', mode='before')
|
|
287
261
|
def parse_zero_bounds(cls, value):
|
|
288
262
|
if isinstance(value, str):
|
|
289
263
|
return json.loads(value)
|
|
290
264
|
return value
|
|
291
|
-
|
|
292
265
|
else:
|
|
293
266
|
# Use @validator for Pydantic 1.x
|
|
294
267
|
from pydantic import validator
|
|
295
268
|
|
|
296
|
-
@validator("zero_bounds", pre=True)
|
|
269
|
+
@validator('zero_bounds', pre=True)
|
|
297
270
|
def parse_zero_bounds(cls, value):
|
|
298
271
|
if isinstance(value, str):
|
|
299
272
|
return json.loads(value)
|
upgini/autofe/operand.py
CHANGED
|
@@ -6,47 +6,7 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class
|
|
10
|
-
_registry = {}
|
|
11
|
-
_parametrized_registry = []
|
|
12
|
-
|
|
13
|
-
def __new__(cls, name, bases, attrs):
|
|
14
|
-
new_class = super().__new__(cls, name, bases, attrs)
|
|
15
|
-
# Only register if it's a concrete class that inherits from Operand
|
|
16
|
-
base_classes = [b for b in bases]
|
|
17
|
-
base_names = {b.__name__ for b in bases}
|
|
18
|
-
while base_classes:
|
|
19
|
-
base = base_classes.pop()
|
|
20
|
-
base_names.update(b.__name__ for b in base.__bases__)
|
|
21
|
-
base_classes.extend(base.__bases__)
|
|
22
|
-
|
|
23
|
-
if "Operand" in base_names:
|
|
24
|
-
# Track parametrized operands separately
|
|
25
|
-
if "ParametrizedOperand" in base_names:
|
|
26
|
-
cls._parametrized_registry.append(new_class)
|
|
27
|
-
else:
|
|
28
|
-
try:
|
|
29
|
-
instance = new_class()
|
|
30
|
-
cls._registry[instance.name] = new_class
|
|
31
|
-
except Exception:
|
|
32
|
-
pass
|
|
33
|
-
return new_class
|
|
34
|
-
|
|
35
|
-
@classmethod
|
|
36
|
-
def get_operand(cls, name: str) -> Optional["Operand"]:
|
|
37
|
-
# First try to resolve as a parametrized operand formula
|
|
38
|
-
for operand_cls in cls._parametrized_registry:
|
|
39
|
-
resolved = operand_cls.from_formula(name)
|
|
40
|
-
if resolved is not None:
|
|
41
|
-
return resolved
|
|
42
|
-
# Fall back to direct registry lookup
|
|
43
|
-
non_parametrized = cls._registry.get(name)
|
|
44
|
-
if non_parametrized is not None:
|
|
45
|
-
return non_parametrized()
|
|
46
|
-
return None
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
9
|
+
class Operand(BaseModel):
|
|
50
10
|
name: str
|
|
51
11
|
alias: Optional[str] = None
|
|
52
12
|
is_unary: bool = False
|
|
@@ -72,12 +32,6 @@ class Operand(BaseModel, metaclass=OperandRegistry):
|
|
|
72
32
|
return res
|
|
73
33
|
|
|
74
34
|
|
|
75
|
-
class ParametrizedOperand(Operand):
|
|
76
|
-
@classmethod
|
|
77
|
-
def from_formula(cls, formula: str) -> Optional["Operand"]:
|
|
78
|
-
pass
|
|
79
|
-
|
|
80
|
-
|
|
81
35
|
MAIN_COLUMN = "main_column"
|
|
82
36
|
|
|
83
37
|
|
upgini/autofe/vector.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Any, Dict, List, Optional
|
|
1
|
+
from typing import List, Optional
|
|
3
2
|
|
|
4
3
|
import pandas as pd
|
|
5
|
-
from pydantic import validator
|
|
6
4
|
|
|
7
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class Mean(PandasOperand, VectorizableMixin):
|
|
@@ -24,132 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
|
|
|
24
22
|
|
|
25
23
|
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
26
24
|
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
30
|
-
is_vector: bool = True
|
|
31
|
-
date_unit: Optional[str] = None
|
|
32
|
-
|
|
33
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
34
|
-
res = super().get_params()
|
|
35
|
-
res.update(
|
|
36
|
-
{
|
|
37
|
-
"date_unit": self.date_unit,
|
|
38
|
-
}
|
|
39
|
-
)
|
|
40
|
-
return res
|
|
41
|
-
|
|
42
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
43
|
-
# assuming first is date, last is value, rest is group columns
|
|
44
|
-
date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
|
|
45
|
-
ts = pd.concat([date] + data[1:], axis=1)
|
|
46
|
-
ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
|
|
47
|
-
ts.set_index(date.name, inplace=True)
|
|
48
|
-
ts = ts[ts.index.notna()].sort_index()
|
|
49
|
-
ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
|
|
50
|
-
ts = self._aggregate(ts)
|
|
51
|
-
ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
|
|
52
|
-
|
|
53
|
-
return ts.iloc[:, -1]
|
|
54
|
-
|
|
55
|
-
@abc.abstractmethod
|
|
56
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
57
|
-
pass
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
64
|
-
aggregation: str
|
|
65
|
-
window_size: int = 1
|
|
66
|
-
window_unit: str = "D"
|
|
67
|
-
|
|
68
|
-
@validator("window_unit")
|
|
69
|
-
def validate_window_unit(cls, v: str) -> str:
|
|
70
|
-
try:
|
|
71
|
-
pd.tseries.frequencies.to_offset(v)
|
|
72
|
-
return v
|
|
73
|
-
except ValueError:
|
|
74
|
-
raise ValueError(
|
|
75
|
-
f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
def __init__(self, **data: Any) -> None:
|
|
79
|
-
if "name" not in data:
|
|
80
|
-
components = [
|
|
81
|
-
"roll",
|
|
82
|
-
str(data.get("window_size") or 1) + str(data.get("window_unit") or "D"),
|
|
83
|
-
data.get("aggregation"),
|
|
84
|
-
]
|
|
85
|
-
data["name"] = "_".join(components).lower()
|
|
86
|
-
super().__init__(**data)
|
|
87
|
-
|
|
88
|
-
@classmethod
|
|
89
|
-
def from_formula(cls, formula: str) -> Optional["Roll"]:
|
|
90
|
-
import re
|
|
91
|
-
|
|
92
|
-
pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
|
|
93
|
-
match = re.match(pattern, formula)
|
|
94
|
-
|
|
95
|
-
if not match:
|
|
96
|
-
return None
|
|
97
|
-
|
|
98
|
-
window_size = int(match.group(1))
|
|
99
|
-
window_unit = match.group(2)
|
|
100
|
-
aggregation = match.group(3)
|
|
101
|
-
|
|
102
|
-
return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
|
|
103
|
-
|
|
104
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
105
|
-
res = super().get_params()
|
|
106
|
-
res.update(
|
|
107
|
-
{
|
|
108
|
-
"window_size": self.window_size,
|
|
109
|
-
"window_unit": self.window_unit,
|
|
110
|
-
"aggregation": self.aggregation,
|
|
111
|
-
}
|
|
112
|
-
)
|
|
113
|
-
return res
|
|
114
|
-
|
|
115
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
116
|
-
return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
|
|
117
|
-
_roll_aggregations.get(self.aggregation, self.aggregation)
|
|
118
|
-
)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
122
|
-
lag_size: int
|
|
123
|
-
lag_unit: str = "D"
|
|
124
|
-
|
|
125
|
-
def __init__(self, **data: Any) -> None:
|
|
126
|
-
if "name" not in data:
|
|
127
|
-
components = [
|
|
128
|
-
"lag",
|
|
129
|
-
str(data.get("lag_size") or 1) + str(data.get("lag_unit") or "D"),
|
|
130
|
-
]
|
|
131
|
-
data["name"] = "_".join(components).lower()
|
|
132
|
-
super().__init__(**data)
|
|
133
|
-
|
|
134
|
-
@classmethod
|
|
135
|
-
def from_formula(cls, formula: str) -> Optional["Lag"]:
|
|
136
|
-
import re
|
|
137
|
-
|
|
138
|
-
pattern = r"^lag_(\d+)([a-zA-Z])$"
|
|
139
|
-
match = re.match(pattern, formula)
|
|
140
|
-
|
|
141
|
-
if not match:
|
|
142
|
-
return None
|
|
143
|
-
|
|
144
|
-
lag_size = int(match.group(1))
|
|
145
|
-
lag_unit = match.group(2)
|
|
146
|
-
|
|
147
|
-
return cls(lag_size=lag_size, lag_unit=lag_unit)
|
|
148
|
-
|
|
149
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
150
|
-
res = super().get_params()
|
|
151
|
-
return res
|
|
152
|
-
|
|
153
|
-
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
|
154
|
-
lag_window = self.lag_size + 1
|
|
155
|
-
return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
|
upgini/features_enricher.py
CHANGED
|
@@ -228,7 +228,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
228
228
|
):
|
|
229
229
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
230
230
|
self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
|
|
231
|
-
if
|
|
231
|
+
if self._api_key is not None and not isinstance(self._api_key, str):
|
|
232
232
|
raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
|
|
233
233
|
self.rest_client = get_rest_client(endpoint, self._api_key, client_ip, client_visitorid)
|
|
234
234
|
self.client_ip = client_ip
|
|
@@ -259,7 +259,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
259
259
|
self.eval_set: Optional[List[Tuple]] = None
|
|
260
260
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
261
261
|
self.imbalanced = False
|
|
262
|
-
self.__cached_sampled_datasets:
|
|
262
|
+
self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
|
|
263
|
+
dict()
|
|
264
|
+
)
|
|
263
265
|
|
|
264
266
|
validate_version(self.logger)
|
|
265
267
|
self.search_keys = search_keys or {}
|
|
@@ -1583,9 +1585,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1583
1585
|
progress_bar: Optional[ProgressBar],
|
|
1584
1586
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1585
1587
|
) -> _SampledDataForMetrics:
|
|
1586
|
-
|
|
1588
|
+
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
1589
|
+
cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
|
|
1590
|
+
if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
|
|
1587
1591
|
self.logger.info("Cached enriched dataset found - use it")
|
|
1588
|
-
return self.__get_sampled_cached_enriched(exclude_features_sources)
|
|
1592
|
+
return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
|
|
1589
1593
|
elif len(self.feature_importances_) == 0:
|
|
1590
1594
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
|
1591
1595
|
return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
|
|
@@ -1615,9 +1619,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1615
1619
|
progress_callback,
|
|
1616
1620
|
)
|
|
1617
1621
|
|
|
1618
|
-
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1622
|
+
def __get_sampled_cached_enriched(
|
|
1623
|
+
self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
|
|
1624
|
+
) -> _SampledDataForMetrics:
|
|
1619
1625
|
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
|
1620
|
-
self.__cached_sampled_datasets
|
|
1626
|
+
self.__cached_sampled_datasets[datasets_hash]
|
|
1621
1627
|
)
|
|
1622
1628
|
if exclude_features_sources:
|
|
1623
1629
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
@@ -1692,7 +1698,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1692
1698
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1693
1699
|
enriched_eval_X = eval_X_sampled
|
|
1694
1700
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1695
|
-
|
|
1701
|
+
|
|
1702
|
+
datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
|
|
1703
|
+
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1696
1704
|
X_sampled,
|
|
1697
1705
|
y_sampled,
|
|
1698
1706
|
enriched_X,
|
|
@@ -1770,7 +1778,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1770
1778
|
enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
|
|
1771
1779
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1772
1780
|
|
|
1773
|
-
self.__cached_sampled_datasets = (
|
|
1781
|
+
datasets_hash = hash_input(self.X, self.y, self.eval_set)
|
|
1782
|
+
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1774
1783
|
X_sampled,
|
|
1775
1784
|
y_sampled,
|
|
1776
1785
|
enriched_X,
|
|
@@ -1895,7 +1904,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1895
1904
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1896
1905
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1897
1906
|
|
|
1898
|
-
|
|
1907
|
+
datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
|
|
1908
|
+
self.__cached_sampled_datasets[datasets_hash] = (
|
|
1899
1909
|
X_sampled,
|
|
1900
1910
|
y_sampled,
|
|
1901
1911
|
enriched_X,
|
|
@@ -2426,7 +2436,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2426
2436
|
):
|
|
2427
2437
|
self.warning_counter.reset()
|
|
2428
2438
|
self.df_with_original_index = None
|
|
2429
|
-
self.__cached_sampled_datasets =
|
|
2439
|
+
self.__cached_sampled_datasets = dict()
|
|
2430
2440
|
self.metrics = None
|
|
2431
2441
|
self.fit_columns_renaming = None
|
|
2432
2442
|
self.fit_dropped_features = set()
|
|
@@ -2533,7 +2543,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2533
2543
|
# Checks that need validated date
|
|
2534
2544
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2535
2545
|
|
|
2536
|
-
if
|
|
2546
|
+
if (
|
|
2547
|
+
is_numeric_dtype(df[self.TARGET_NAME])
|
|
2548
|
+
and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
|
|
2549
|
+
and has_date
|
|
2550
|
+
):
|
|
2537
2551
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2538
2552
|
|
|
2539
2553
|
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
@@ -4196,6 +4210,8 @@ def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optiona
|
|
|
4196
4210
|
if y is not None:
|
|
4197
4211
|
hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
|
|
4198
4212
|
if eval_set is not None:
|
|
4213
|
+
if isinstance(eval_set, tuple):
|
|
4214
|
+
eval_set = [eval_set]
|
|
4199
4215
|
for eval_X, eval_y in eval_set:
|
|
4200
4216
|
hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
|
|
4201
4217
|
hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
2
|
-
upgini/__init__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=P6UdnfqZMN8bM1yBQGaUu5LMabVISCCurCBNtZJOvTE,23
|
|
2
|
+
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=CK_ymyXeS0JxzBxy2y2UJ7miwy0DUcwdJdJBoFNY0IE,193511
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
|
|
20
20
|
upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
|
|
21
21
|
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
22
|
-
upgini/autofe/operand.py,sha256=
|
|
22
|
+
upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
24
|
+
upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.22.dist-info/METADATA,sha256=xz213bCp7FlucAgHEqT8KlX7G0E_BMwP3wN444cz3QU,48578
|
|
61
|
+
upgini-1.2.22.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.22.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.22.dist-info/RECORD,,
|
|
File without changes
|