upgini 1.2.70a3832.dev2__py3-none-any.whl → 1.2.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +15 -21
- upgini/autofe/feature.py +5 -1
- upgini/autofe/timeseries/base.py +3 -9
- upgini/autofe/timeseries/cross.py +22 -12
- upgini/autofe/timeseries/roll.py +2 -7
- upgini/autofe/timeseries/trend.py +2 -1
- upgini/autofe/utils.py +83 -0
- upgini/dataset.py +8 -1
- upgini/features_enricher.py +340 -275
- upgini/metadata.py +4 -0
- upgini/metrics.py +67 -60
- upgini/resource_bundle/strings.properties +1 -0
- upgini/search_task.py +7 -1
- upgini/utils/mstats.py +1 -1
- upgini/utils/sklearn_ext.py +11 -0
- upgini/utils/sort.py +1 -1
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71.dist-info}/METADATA +3 -4
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71.dist-info}/RECORD +22 -22
- upgini/lazy_import.py +0 -35
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71.dist-info}/WHEEL +0 -0
- {upgini-1.2.70a3832.dev2.dist-info → upgini-1.2.71.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.70a3832.dev2"
|
|
1
|
+
__version__ = "1.2.71"
|
upgini/autofe/date.py
CHANGED
|
@@ -8,6 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
10
|
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
11
|
+
from upgini.autofe.utils import pydantic_validator
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def get_pydantic_version():
|
|
@@ -209,6 +210,14 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
|
209
210
|
|
|
210
211
|
return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
|
|
211
212
|
|
|
213
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
214
|
+
res = super().get_params()
|
|
215
|
+
if self.lower_bound is not None:
|
|
216
|
+
res["lower_bound"] = str(self.lower_bound)
|
|
217
|
+
if self.upper_bound is not None:
|
|
218
|
+
res["upper_bound"] = str(self.upper_bound)
|
|
219
|
+
return res
|
|
220
|
+
|
|
212
221
|
def _agg(self, x):
|
|
213
222
|
x = x[
|
|
214
223
|
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
@@ -269,32 +278,17 @@ class DatePercentile(DatePercentileBase):
|
|
|
269
278
|
{
|
|
270
279
|
"zero_month": self.zero_month,
|
|
271
280
|
"zero_year": self.zero_year,
|
|
272
|
-
"zero_bounds": self.zero_bounds,
|
|
281
|
+
"zero_bounds": json.dumps(self.zero_bounds),
|
|
273
282
|
"step": self.step,
|
|
274
283
|
}
|
|
275
284
|
)
|
|
276
285
|
return res
|
|
277
286
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
@field_validator("zero_bounds", mode="before")
|
|
284
|
-
def parse_zero_bounds(cls, value):
|
|
285
|
-
if isinstance(value, str):
|
|
286
|
-
return json.loads(value)
|
|
287
|
-
return value
|
|
288
|
-
|
|
289
|
-
else:
|
|
290
|
-
# Use @validator for Pydantic 1.x
|
|
291
|
-
from pydantic import validator
|
|
292
|
-
|
|
293
|
-
@validator("zero_bounds", pre=True)
|
|
294
|
-
def parse_zero_bounds(cls, value):
|
|
295
|
-
if isinstance(value, str):
|
|
296
|
-
return json.loads(value)
|
|
297
|
-
return value
|
|
287
|
+
@pydantic_validator("zero_bounds", mode="before")
|
|
288
|
+
def parse_zero_bounds(cls, value):
|
|
289
|
+
if isinstance(value, str):
|
|
290
|
+
return json.loads(value)
|
|
291
|
+
return value
|
|
298
292
|
|
|
299
293
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
300
294
|
months = date_col.dt.month
|
upgini/autofe/feature.py
CHANGED
|
@@ -112,7 +112,11 @@ class Feature:
|
|
|
112
112
|
|
|
113
113
|
def get_hash(self) -> str:
|
|
114
114
|
return hashlib.sha256(
|
|
115
|
-
"_".join(
|
|
115
|
+
"_".join(
|
|
116
|
+
[self.op.get_hash_component()]
|
|
117
|
+
+ [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
|
|
118
|
+
+ [ch.get_display_name() for ch in self.children]
|
|
119
|
+
).encode("utf-8")
|
|
116
120
|
).hexdigest()[:8]
|
|
117
121
|
|
|
118
122
|
def set_alias(self, alias: str) -> "Feature":
|
upgini/autofe/timeseries/base.py
CHANGED
|
@@ -1,15 +1,9 @@
|
|
|
1
1
|
import abc
|
|
2
|
-
from typing import Dict, List, Optional
|
|
2
|
+
from typing import Dict, List, Optional, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from upgini.autofe.operator import PandasOperator
|
|
6
6
|
|
|
7
|
-
# Used in derived classes
|
|
8
|
-
try:
|
|
9
|
-
from pydantic import field_validator as validator # V2
|
|
10
|
-
except ImportError:
|
|
11
|
-
from pydantic import validator # V1
|
|
12
|
-
|
|
13
7
|
|
|
14
8
|
class TimeSeriesBase(PandasOperator, abc.ABC):
|
|
15
9
|
is_vector: bool = True
|
|
@@ -70,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
|
|
|
70
64
|
return base_formula
|
|
71
65
|
|
|
72
66
|
@classmethod
|
|
73
|
-
def _parse_offset_from_formula(cls, formula: str, base_regex: str) ->
|
|
67
|
+
def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
|
|
74
68
|
"""
|
|
75
69
|
Parse the offset component from a formula.
|
|
76
70
|
|
|
@@ -85,7 +79,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
|
|
|
85
79
|
"""
|
|
86
80
|
import re
|
|
87
81
|
|
|
88
|
-
offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])"
|
|
82
|
+
offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])$"
|
|
89
83
|
match = re.match(offset_regex, formula)
|
|
90
84
|
|
|
91
85
|
if match:
|
upgini/autofe/timeseries/cross.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from typing import Dict, List, Optional
|
|
2
3
|
|
|
3
4
|
import numpy as np
|
|
4
5
|
import pandas as pd
|
|
5
6
|
|
|
6
|
-
try:
|
|
7
|
-
from pydantic import field_validator as validator # V2
|
|
8
|
-
except ImportError:
|
|
9
|
-
from pydantic import validator # V1
|
|
10
|
-
|
|
11
7
|
from upgini.autofe.all_operators import find_op
|
|
12
8
|
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
13
9
|
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
10
|
+
from upgini.autofe.utils import pydantic_validator
|
|
14
11
|
|
|
15
12
|
|
|
16
13
|
class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
@@ -20,11 +17,24 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
|
20
17
|
left_descriptor: List[str] = []
|
|
21
18
|
right_descriptor: List[str] = []
|
|
22
19
|
|
|
23
|
-
@validator("descriptor_indices")
|
|
24
|
-
@classmethod
|
|
20
|
+
@pydantic_validator("descriptor_indices", mode="before")
|
|
25
21
|
def validate_descriptor_indices(cls, v):
|
|
22
|
+
if isinstance(v, str):
|
|
23
|
+
v = json.loads(v)
|
|
26
24
|
if not v:
|
|
27
|
-
raise ValueError("descriptor_indices cannot be empty
|
|
25
|
+
raise ValueError("descriptor_indices cannot be empty")
|
|
26
|
+
return v
|
|
27
|
+
|
|
28
|
+
@pydantic_validator("left_descriptor", "right_descriptor", mode="before")
|
|
29
|
+
def parse_descriptors(cls, v):
|
|
30
|
+
if isinstance(v, str):
|
|
31
|
+
return json.loads(v)
|
|
32
|
+
return v
|
|
33
|
+
|
|
34
|
+
@pydantic_validator("interaction_op", mode="before")
|
|
35
|
+
def validate_interaction_op(cls, v):
|
|
36
|
+
if isinstance(v, str):
|
|
37
|
+
return find_op(v)
|
|
28
38
|
return v
|
|
29
39
|
|
|
30
40
|
def __init__(self, **data):
|
|
@@ -83,14 +93,14 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
|
83
93
|
|
|
84
94
|
return cls(**params)
|
|
85
95
|
|
|
86
|
-
def get_params(self) -> Dict[str, str
|
|
96
|
+
def get_params(self) -> Dict[str, Optional[str]]:
|
|
87
97
|
res = super().get_params()
|
|
88
98
|
res.update(
|
|
89
99
|
{
|
|
90
100
|
"interaction_op": self._get_interaction_op_name(),
|
|
91
|
-
"descriptor_indices": self.descriptor_indices,
|
|
92
|
-
"left_descriptor": self.left_descriptor,
|
|
93
|
-
"right_descriptor": self.right_descriptor,
|
|
101
|
+
"descriptor_indices": json.dumps(self.descriptor_indices),
|
|
102
|
+
"left_descriptor": json.dumps(self.left_descriptor),
|
|
103
|
+
"right_descriptor": json.dumps(self.right_descriptor),
|
|
94
104
|
}
|
|
95
105
|
)
|
|
96
106
|
return res
|
upgini/autofe/timeseries/roll.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Dict, Optional
|
|
|
3
3
|
|
|
4
4
|
from upgini.autofe.operator import ParametrizedOperator
|
|
5
5
|
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
+
from upgini.autofe.utils import pydantic_validator
|
|
6
7
|
|
|
7
8
|
# Roll aggregation functions
|
|
8
9
|
roll_aggregations = {
|
|
@@ -12,19 +13,13 @@ roll_aggregations = {
|
|
|
12
13
|
"iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
|
|
13
14
|
}
|
|
14
15
|
|
|
15
|
-
try:
|
|
16
|
-
from pydantic import field_validator as validator # V2
|
|
17
|
-
except ImportError:
|
|
18
|
-
from pydantic import validator # V1
|
|
19
|
-
|
|
20
16
|
|
|
21
17
|
class Roll(TimeSeriesBase, ParametrizedOperator):
|
|
22
18
|
aggregation: str
|
|
23
19
|
window_size: int = 1
|
|
24
20
|
window_unit: str = "D"
|
|
25
21
|
|
|
26
|
-
@validator("window_unit")
|
|
27
|
-
@classmethod
|
|
22
|
+
@pydantic_validator("window_unit")
|
|
28
23
|
def validate_window_unit(cls, v: str) -> str:
|
|
29
24
|
try:
|
|
30
25
|
pd.tseries.frequencies.to_offset(v)
|
upgini/autofe/timeseries/trend.py
CHANGED
|
@@ -2,10 +2,11 @@ from typing import Dict, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
+
from upgini.autofe.operator import ParametrizedOperator
|
|
5
6
|
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
7
|
|
|
7
8
|
|
|
8
|
-
class TrendCoefficient(TimeSeriesBase):
|
|
9
|
+
class TrendCoefficient(TimeSeriesBase, ParametrizedOperator):
|
|
9
10
|
name: str = "trend_coef"
|
|
10
11
|
step_size: int = 1
|
|
11
12
|
step_unit: str = "D"
|
upgini/autofe/utils.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for autofe module.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_pydantic_version():
|
|
10
|
+
"""
|
|
11
|
+
Get the major version of pydantic.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
int: Major version number (1 or 2)
|
|
15
|
+
"""
|
|
16
|
+
try:
|
|
17
|
+
from pydantic import __version__ as pydantic_version
|
|
18
|
+
|
|
19
|
+
major_version = int(pydantic_version.split(".")[0])
|
|
20
|
+
return major_version
|
|
21
|
+
except (ImportError, ValueError):
|
|
22
|
+
# Default to version 1 if unable to determine
|
|
23
|
+
return 1
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
|
|
27
|
+
"""
|
|
28
|
+
A decorator that applies the appropriate Pydantic validator based on the installed version.
|
|
29
|
+
|
|
30
|
+
This decorator handles the differences between Pydantic v1 and v2 validator syntax,
|
|
31
|
+
making it easier to write code that works with both versions.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
field_name (str): The name of the field to validate
|
|
35
|
+
mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
|
|
36
|
+
**kwargs: Additional arguments to pass to the validator
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
Callable: A decorator that can be applied to validator methods
|
|
40
|
+
|
|
41
|
+
Example:
|
|
42
|
+
```python
|
|
43
|
+
class MyModel(BaseModel):
|
|
44
|
+
items: List[int]
|
|
45
|
+
|
|
46
|
+
@pydantic_validator("items")
|
|
47
|
+
def parse_items(cls, value):
|
|
48
|
+
if isinstance(value, str):
|
|
49
|
+
return [int(x) for x in value.split(",")]
|
|
50
|
+
return value
|
|
51
|
+
```
|
|
52
|
+
"""
|
|
53
|
+
pydantic_version = get_pydantic_version()
|
|
54
|
+
|
|
55
|
+
if pydantic_version >= 2:
|
|
56
|
+
# Use field_validator for Pydantic 2.x
|
|
57
|
+
from pydantic import field_validator
|
|
58
|
+
|
|
59
|
+
def decorator(func: Callable) -> Callable:
|
|
60
|
+
@field_validator(field_name, *fields, mode=mode, **kwargs)
|
|
61
|
+
@functools.wraps(func)
|
|
62
|
+
def wrapper(cls, value, **kw):
|
|
63
|
+
return func(cls, value)
|
|
64
|
+
|
|
65
|
+
return wrapper
|
|
66
|
+
|
|
67
|
+
return decorator
|
|
68
|
+
else:
|
|
69
|
+
# Use validator for Pydantic 1.x
|
|
70
|
+
from pydantic import validator
|
|
71
|
+
|
|
72
|
+
# Map mode to Pydantic v1 parameters
|
|
73
|
+
pre = True if mode == "before" else False
|
|
74
|
+
|
|
75
|
+
def decorator(func: Callable) -> Callable:
|
|
76
|
+
@validator(field_name, *fields, pre=pre, **kwargs)
|
|
77
|
+
@functools.wraps(func)
|
|
78
|
+
def wrapper(cls, value, **kw):
|
|
79
|
+
return func(cls, value)
|
|
80
|
+
|
|
81
|
+
return wrapper
|
|
82
|
+
|
|
83
|
+
return decorator
|
upgini/dataset.py
CHANGED
|
@@ -22,6 +22,7 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
+
AutoFEParameters,
|
|
25
26
|
CVType,
|
|
26
27
|
DataType,
|
|
27
28
|
FeaturesFilter,
|
|
@@ -558,6 +559,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
558
559
|
filter_features: Optional[dict] = None,
|
|
559
560
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
|
560
561
|
metrics_calculation: Optional[bool] = False,
|
|
562
|
+
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
|
561
563
|
) -> SearchCustomization:
|
|
562
564
|
# self.logger.info("Constructing search customization")
|
|
563
565
|
search_customization = SearchCustomization(
|
|
@@ -585,7 +587,10 @@ class Dataset: # (pd.DataFrame):
|
|
|
585
587
|
search_customization.featuresFilter = feature_filter
|
|
586
588
|
|
|
587
589
|
search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
|
|
588
|
-
|
|
590
|
+
if auto_fe_parameters is not None:
|
|
591
|
+
search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
|
|
592
|
+
auto_fe_parameters.ts_gap_days
|
|
593
|
+
)
|
|
589
594
|
return search_customization
|
|
590
595
|
|
|
591
596
|
def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
@@ -640,6 +645,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
640
645
|
max_features: Optional[int] = None, # deprecated
|
|
641
646
|
filter_features: Optional[dict] = None, # deprecated
|
|
642
647
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
|
648
|
+
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
|
643
649
|
force_downsampling: bool = False,
|
|
644
650
|
) -> SearchTask:
|
|
645
651
|
if self.etalon_def is None:
|
|
@@ -658,6 +664,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
658
664
|
max_features=max_features,
|
|
659
665
|
filter_features=filter_features,
|
|
660
666
|
runtime_parameters=runtime_parameters,
|
|
667
|
+
auto_fe_parameters=auto_fe_parameters,
|
|
661
668
|
)
|
|
662
669
|
|
|
663
670
|
if self.file_upload_id is not None and self.rest_client.check_uploaded_file_v2(
|