upgini 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.60a3792.dev2"
1
+ __version__ = "1.2.62a3818.dev1"
upgini/autofe/all_operands.py CHANGED
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operand import OperandRegistry
1
+ from upgini.autofe.operator import OperatorRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperandRegistry.get_operand(name)
10
+ return OperatorRegistry.get_operand(name)
upgini/autofe/binary.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
8
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
upgini/autofe/date.py CHANGED
@@ -1,13 +1,13 @@
1
1
  import abc
2
2
  import json
3
- from typing import Any, Dict, List, Optional, Union
3
+ from typing import Dict, List, Optional, Union
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand
10
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
upgini/autofe/feature.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operand import Operand, PandasOperand
10
+ from upgini.autofe.operator import Operand, PandasOperand
11
11
 
12
12
 
13
13
  class Column:
upgini/autofe/groupby.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
upgini/autofe/operand.py → upgini/autofe/operator.py RENAMED
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperandRegistry(type(BaseModel)):
9
+ class OperatorRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
46
46
  return None
47
47
 
48
48
 
49
- class Operand(BaseModel, metaclass=OperandRegistry):
49
+ class Operand(BaseModel, metaclass=OperatorRegistry):
50
50
  name: Optional[str] = None
51
51
  alias: Optional[str] = None
52
52
  is_unary: bool = False
@@ -0,0 +1,200 @@
1
+ import abc
2
+ from typing import Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
6
+
7
+ try:
8
+ from pydantic import field_validator as validator # V2
9
+ except ImportError:
10
+ from pydantic import validator # V1
11
+
12
+
13
+ class TimeSeriesBase(PandasOperand, abc.ABC):
14
+ is_vector: bool = True
15
+ date_unit: Optional[str] = None
16
+ offset_size: int = 0
17
+ offset_unit: str = "D"
18
+
19
+ def get_params(self) -> Dict[str, Optional[str]]:
20
+ res = super().get_params()
21
+ res.update(
22
+ {
23
+ "date_unit": self.date_unit,
24
+ "offset_size": self.offset_size,
25
+ "offset_unit": self.offset_unit,
26
+ }
27
+ )
28
+ return res
29
+
30
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
31
+ # assuming first is date, last is value, rest is group columns
32
+ date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
33
+ ts = pd.concat([date] + data[1:], axis=1)
34
+ ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
35
+ ts.set_index(date.name, inplace=True)
36
+ ts = ts[ts.index.notna()].sort_index()
37
+ ts = (
38
+ ts.groupby([c.name for c in data[1:-1]], group_keys=True)
39
+ .apply(self._shift)[data[-1].name]
40
+ .to_frame()
41
+ .reset_index()
42
+ .set_index(date.name)
43
+ .groupby([c.name for c in data[1:-1]])
44
+ if len(data) > 2
45
+ else self._shift(ts)
46
+ )
47
+ ts = self._aggregate(ts)
48
+ ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
49
+ ts.index = date.index
50
+
51
+ return ts.iloc[:, -1]
52
+
53
+ def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
54
+ if self.offset_size > 0:
55
+ return ts.iloc[:, :-1].merge(
56
+ ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
57
+ left_index=True,
58
+ right_index=True,
59
+ )
60
+ return ts
61
+
62
+ @abc.abstractmethod
63
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
64
+ pass
65
+
66
+
67
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
68
+
69
+
70
+ class Roll(TimeSeriesBase, ParametrizedOperand):
71
+ aggregation: str
72
+ window_size: int = 1
73
+ window_unit: str = "D"
74
+
75
+ @validator("window_unit")
76
+ @classmethod
77
+ def validate_window_unit(cls, v: str) -> str:
78
+ try:
79
+ pd.tseries.frequencies.to_offset(v)
80
+ return v
81
+ except ValueError:
82
+ raise ValueError(
83
+ f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
84
+ )
85
+
86
+ def to_formula(self) -> str:
87
+ roll_component = f"roll_{self.window_size}{self.window_unit}"
88
+ if self.offset_size > 0:
89
+ roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
90
+ return f"{roll_component}_{self.aggregation}"
91
+
92
+ @classmethod
93
+ def from_formula(cls, formula: str) -> Optional["Roll"]:
94
+ import re
95
+
96
+ # Try matching pattern with offset first
97
+ pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
98
+ match_with_offset = re.match(pattern_with_offset, formula)
99
+
100
+ if match_with_offset:
101
+ window_size = int(match_with_offset.group(1))
102
+ window_unit = match_with_offset.group(2)
103
+ offset_size = int(match_with_offset.group(3))
104
+ offset_unit = match_with_offset.group(4)
105
+ aggregation = match_with_offset.group(5)
106
+
107
+ return cls(
108
+ window_size=window_size,
109
+ window_unit=window_unit,
110
+ offset_size=offset_size,
111
+ offset_unit=offset_unit,
112
+ aggregation=aggregation,
113
+ )
114
+
115
+ # If no offset pattern found, try basic pattern
116
+ pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
117
+ match = re.match(pattern, formula)
118
+
119
+ if not match:
120
+ return None
121
+
122
+ window_size = int(match.group(1))
123
+ window_unit = match.group(2)
124
+ aggregation = match.group(3)
125
+
126
+ return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
127
+
128
+ def get_params(self) -> Dict[str, Optional[str]]:
129
+ res = super().get_params()
130
+ res.update(
131
+ {
132
+ "window_size": self.window_size,
133
+ "window_unit": self.window_unit,
134
+ "aggregation": self.aggregation,
135
+ }
136
+ )
137
+ return res
138
+
139
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
140
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
141
+ _roll_aggregations.get(self.aggregation, self.aggregation)
142
+ )
143
+
144
+
145
+ class Lag(TimeSeriesBase, ParametrizedOperand):
146
+ lag_size: int
147
+ lag_unit: str = "D"
148
+
149
+ def to_formula(self) -> str:
150
+ lag_component = f"lag_{self.lag_size}{self.lag_unit}"
151
+ if self.offset_size > 0:
152
+ lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
153
+ return lag_component
154
+
155
+ @classmethod
156
+ def from_formula(cls, formula: str) -> Optional["Lag"]:
157
+ import re
158
+
159
+ # Try matching pattern with offset first
160
+ pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
161
+ match_with_offset = re.match(pattern_with_offset, formula)
162
+
163
+ if match_with_offset:
164
+ lag_size = int(match_with_offset.group(1))
165
+ lag_unit = match_with_offset.group(2)
166
+ offset_size = int(match_with_offset.group(3))
167
+ offset_unit = match_with_offset.group(4)
168
+
169
+ return cls(
170
+ lag_size=lag_size,
171
+ lag_unit=lag_unit,
172
+ offset_size=offset_size,
173
+ offset_unit=offset_unit,
174
+ )
175
+
176
+ # If no offset pattern found, try basic pattern
177
+ pattern = r"^lag_(\d+)([a-zA-Z])$"
178
+ match = re.match(pattern, formula)
179
+
180
+ if not match:
181
+ return None
182
+
183
+ lag_size = int(match.group(1))
184
+ lag_unit = match.group(2)
185
+
186
+ return cls(lag_size=lag_size, lag_unit=lag_unit)
187
+
188
+ def get_params(self) -> Dict[str, Optional[str]]:
189
+ res = super().get_params()
190
+ res.update(
191
+ {
192
+ "lag_size": self.lag_size,
193
+ "lag_unit": self.lag_unit,
194
+ }
195
+ )
196
+ return res
197
+
198
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
199
+ lag_window = self.lag_size + 1
200
+ return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
upgini/autofe/unary.py CHANGED
@@ -2,7 +2,7 @@ from typing import Dict, Optional
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Abs(PandasOperand, VectorizableMixin):
upgini/autofe/vector.py CHANGED
@@ -1,14 +1,8 @@
1
- import abc
2
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
3
2
 
4
3
  import pandas as pd
5
4
 
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
10
-
11
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
12
6
 
13
7
 
14
8
  class Mean(PandasOperand, VectorizableMixin):
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
28
22
 
29
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
-
32
-
33
- class TimeSeriesBase(PandasOperand, abc.ABC):
34
- is_vector: bool = True
35
- date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
-
39
- def get_params(self) -> Dict[str, Optional[str]]:
40
- res = super().get_params()
41
- res.update(
42
- {
43
- "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
- }
47
- )
48
- return res
49
-
50
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
51
- # assuming first is date, last is value, rest is group columns
52
- date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
53
- ts = pd.concat([date] + data[1:], axis=1)
54
- ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
- ts.set_index(date.name, inplace=True)
56
- ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
67
- ts = self._aggregate(ts)
68
- ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
- ts.index = date.index
70
-
71
- return ts.iloc[:, -1]
72
-
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
- @abc.abstractmethod
83
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
- pass
85
-
86
-
87
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
88
-
89
-
90
- class Roll(TimeSeriesBase, ParametrizedOperand):
91
- aggregation: str
92
- window_size: int = 1
93
- window_unit: str = "D"
94
-
95
- @validator("window_unit")
96
- @classmethod
97
- def validate_window_unit(cls, v: str) -> str:
98
- try:
99
- pd.tseries.frequencies.to_offset(v)
100
- return v
101
- except ValueError:
102
- raise ValueError(
103
- f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
104
- )
105
-
106
- def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
111
-
112
- @classmethod
113
- def from_formula(cls, formula: str) -> Optional["Roll"]:
114
- import re
115
-
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
- pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
- match = re.match(pattern, formula)
138
-
139
- if not match:
140
- return None
141
-
142
- window_size = int(match.group(1))
143
- window_unit = match.group(2)
144
- aggregation = match.group(3)
145
-
146
- return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
147
-
148
- def get_params(self) -> Dict[str, Optional[str]]:
149
- res = super().get_params()
150
- res.update(
151
- {
152
- "window_size": self.window_size,
153
- "window_unit": self.window_unit,
154
- "aggregation": self.aggregation,
155
- }
156
- )
157
- return res
158
-
159
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
161
- _roll_aggregations.get(self.aggregation, self.aggregation)
162
- )
163
-
164
-
165
- class Lag(TimeSeriesBase, ParametrizedOperand):
166
- lag_size: int
167
- lag_unit: str = "D"
168
-
169
- def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
174
-
175
- @classmethod
176
- def from_formula(cls, formula: str) -> Optional["Lag"]:
177
- import re
178
-
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
- pattern = r"^lag_(\d+)([a-zA-Z])$"
198
- match = re.match(pattern, formula)
199
-
200
- if not match:
201
- return None
202
-
203
- lag_size = int(match.group(1))
204
- lag_unit = match.group(2)
205
-
206
- return cls(lag_size=lag_size, lag_unit=lag_unit)
207
-
208
- def get_params(self) -> Dict[str, Optional[str]]:
209
- res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
- return res
217
-
218
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
219
- lag_window = self.lag_size + 1
220
- return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
@@ -63,6 +63,7 @@ class DataSourcePublisher:
63
63
  keep_features: Optional[List[str]] = None,
64
64
  date_features: Optional[List[str]] = None,
65
65
  date_vector_features: Optional[List[str]] = None,
66
+ date_features_format: Optional[str] = None,
66
67
  generate_runtime_embeddings: Optional[List[str]] = None,
67
68
  exclude_raw: Optional[List[str]] = None,
68
69
  _force_generation=False,
@@ -160,13 +161,17 @@ class DataSourcePublisher:
160
161
  if keep_features is not None:
161
162
  request["keepFeatures"] = keep_features
162
163
  if date_features is not None:
163
- if date_format is None:
164
- raise ValidationError("date_format should be presented if you use date features")
164
+ if date_features_format is None:
165
+ raise ValidationError("date_features_format should be presented if you use date features")
165
166
  request["dateFeatures"] = date_features
167
+ request["dateFeaturesFormat"] = date_features_format
166
168
  if date_vector_features is not None:
167
- if date_format is None:
168
- raise ValidationError("date_format should be presented if you use date vector features")
169
+ if date_features_format is None:
170
+ raise ValidationError(
171
+ "date_features_format should be presented if you use date vector features"
172
+ )
169
173
  request["dateVectorFeatures"] = date_vector_features
174
+ request["dateFeaturesFormat"] = date_features_format
170
175
  if generate_runtime_embeddings is not None:
171
176
  request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
172
177
  if exclude_raw is not None: