upgini 1.2.62__py3-none-any.whl → 1.2.62a3818.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.62"
1
+ __version__ = "1.2.62a3818.dev2"
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operand import OperandRegistry
1
+ from upgini.autofe.operator import OperatorRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperandRegistry.get_operand(name)
10
+ return OperatorRegistry.get_operand(name)
upgini/autofe/binary.py CHANGED
@@ -5,10 +5,10 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
8
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
9
9
 
10
10
 
11
- class Min(PandasOperand):
11
+ class Min(PandasOperator):
12
12
  name: str = "min"
13
13
  is_binary: bool = True
14
14
  is_symmetrical: bool = True
@@ -18,7 +18,7 @@ class Min(PandasOperand):
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
- class Max(PandasOperand):
21
+ class Max(PandasOperator):
22
22
  name: str = "max"
23
23
  is_binary: bool = True
24
24
  is_symmetrical: bool = True
@@ -28,7 +28,7 @@ class Max(PandasOperand):
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
- class Add(PandasOperand, VectorizableMixin):
31
+ class Add(PandasOperator, VectorizableMixin):
32
32
  name: str = "+"
33
33
  alias: str = "add"
34
34
  is_binary: bool = True
@@ -47,7 +47,7 @@ class Add(PandasOperand, VectorizableMixin):
47
47
  return d1.add(d2, axis=0)
48
48
 
49
49
 
50
- class Subtract(PandasOperand, VectorizableMixin):
50
+ class Subtract(PandasOperator, VectorizableMixin):
51
51
  name: str = "-"
52
52
  alias: str = "sub"
53
53
  is_binary: bool = True
@@ -66,7 +66,7 @@ class Subtract(PandasOperand, VectorizableMixin):
66
66
  return d1.sub(d2, axis=0)
67
67
 
68
68
 
69
- class Multiply(PandasOperand, VectorizableMixin):
69
+ class Multiply(PandasOperator, VectorizableMixin):
70
70
  name: str = "*"
71
71
  alias: str = "mul"
72
72
  is_binary: bool = True
@@ -85,7 +85,7 @@ class Multiply(PandasOperand, VectorizableMixin):
85
85
  return d1.mul(d2, axis=0)
86
86
 
87
87
 
88
- class Divide(PandasOperand, VectorizableMixin):
88
+ class Divide(PandasOperator, VectorizableMixin):
89
89
  name: str = "/"
90
90
  alias: str = "div"
91
91
  is_binary: bool = True
@@ -104,7 +104,7 @@ class Divide(PandasOperand, VectorizableMixin):
104
104
  return d1.div(d2.replace(0, np.nan), axis=0)
105
105
 
106
106
 
107
- class Combine(PandasOperand):
107
+ class Combine(PandasOperator):
108
108
  name: str = "Combine"
109
109
  is_binary: bool = True
110
110
  has_symmetry_importance: bool = True
@@ -116,7 +116,7 @@ class Combine(PandasOperand):
116
116
  return pd.Series(temp, index=left.index)
117
117
 
118
118
 
119
- class CombineThenFreq(PandasOperand):
119
+ class CombineThenFreq(PandasOperator):
120
120
  name: str = "CombineThenFreq"
121
121
  is_binary: bool = True
122
122
  is_symmetrical: bool = True
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperand):
132
132
  self._loc(temp, value_counts)
133
133
 
134
134
 
135
- class Distance(PandasOperand):
135
+ class Distance(PandasOperator):
136
136
  name: str = "dist"
137
137
  is_binary: bool = True
138
138
  output_type: Optional[str] = "float"
@@ -170,7 +170,7 @@ class Sim(Distance):
170
170
  return 1 - super().calculate_binary(left, right)
171
171
 
172
172
 
173
- class StringSim(PandasOperand, abc.ABC):
173
+ class StringSim(PandasOperator, abc.ABC):
174
174
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
175
175
  sims = []
176
176
  for i in left.index:
upgini/autofe/date.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand
10
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator
11
11
 
12
12
 
13
13
  def get_pydantic_version():
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
43
43
  raise Exception(f"Unsupported difference unit: {self.diff_unit}")
44
44
 
45
45
 
46
- class DateDiff(PandasOperand, DateDiffMixin):
46
+ class DateDiff(PandasOperator, DateDiffMixin):
47
47
  name: str = "date_diff"
48
48
  alias: Optional[str] = "date_diff_type1"
49
49
  is_binary: bool = True
@@ -78,7 +78,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
78
78
  return x
79
79
 
80
80
 
81
- class DateDiffType2(PandasOperand, DateDiffMixin):
81
+ class DateDiffType2(PandasOperator, DateDiffMixin):
82
82
  name: str = "date_diff_type2"
83
83
  is_binary: bool = True
84
84
  has_symmetry_importance: bool = True
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
112
112
  _count_aggregations = ["nunique", "count"]
113
113
 
114
114
 
115
- class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
115
+ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
116
116
  is_binary: bool = True
117
117
  has_symmetry_importance: bool = True
118
118
 
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
183
183
  return method(x) if len(x) > 0 else default
184
184
 
185
185
 
186
- class DateListDiffBounded(DateListDiff, ParametrizedOperand):
186
+ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
187
187
  lower_bound: Optional[int] = None
188
188
  upper_bound: Optional[int] = None
189
189
 
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
217
217
  return super()._agg(x)
218
218
 
219
219
 
220
- class DatePercentileBase(PandasOperand, abc.ABC):
220
+ class DatePercentileBase(PandasOperator, abc.ABC):
221
221
  is_binary: bool = True
222
222
  output_type: Optional[str] = "float"
223
223
 
upgini/autofe/feature.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operand import Operand, PandasOperand
10
+ from upgini.autofe.operator import Operator, PandasOperator
11
11
 
12
12
 
13
13
  class Column:
@@ -65,7 +65,7 @@ class Column:
65
65
  class Feature:
66
66
  def __init__(
67
67
  self,
68
- op: Operand,
68
+ op: Operator,
69
69
  children: List[Union[Column, "Feature"]],
70
70
  data: Optional[pd.DataFrame] = None,
71
71
  display_index: Optional[str] = None,
@@ -188,7 +188,7 @@ class Feature:
188
188
  return self.children[0].infer_type(data)
189
189
 
190
190
  def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
191
- if isinstance(self.op, PandasOperand):
191
+ if isinstance(self.op, PandasOperator):
192
192
  if self.op.is_vector:
193
193
  ds = [child.calculate(data) for child in self.children]
194
194
  new_data = self.op.calculate(data=ds)
@@ -324,7 +324,7 @@ class Feature:
324
324
 
325
325
  class FeatureGroup:
326
326
  def __init__(
327
- self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
327
+ self, op: Operator, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
328
328
  ):
329
329
  self.op = op
330
330
  self.main_column_node = main_column
@@ -345,7 +345,7 @@ class FeatureGroup:
345
345
  return names
346
346
 
347
347
  def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
348
- if isinstance(self.op, PandasOperand):
348
+ if isinstance(self.op, PandasOperator):
349
349
  main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
350
350
  lower_order_children = []
351
351
  if self.main_column_node is not None:
@@ -378,7 +378,7 @@ class FeatureGroup:
378
378
  def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
379
379
  grouped_features = []
380
380
 
381
- def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
381
+ def groupby_func(f: Feature) -> Tuple[Operator, Union[Column, Feature]]:
382
382
  return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
383
383
 
384
384
  for op_child, features in itertools.groupby(candidates, groupby_func):
upgini/autofe/groupby.py CHANGED
@@ -2,13 +2,13 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
9
- PandasOperand,
9
+ PandasOperator,
10
10
  VectorizableMixin,
11
- ParametrizedOperand,
11
+ ParametrizedOperator,
12
12
  ):
13
13
  agg: Optional[str]
14
14
  is_vectorizable: bool = True
@@ -39,7 +39,7 @@ class GroupByThenAgg(
39
39
  return temp.merge(d2, how="right", on=[group_column])[value_columns]
40
40
 
41
41
 
42
- class GroupByThenRank(PandasOperand, VectorizableMixin):
42
+ class GroupByThenRank(PandasOperator, VectorizableMixin):
43
43
  name: str = "GroupByThenRank"
44
44
  is_vectorizable: bool = True
45
45
  is_grouping: bool = True
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
58
58
  return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
59
59
 
60
60
 
61
- class GroupByThenNUnique(PandasOperand, VectorizableMixin):
61
+ class GroupByThenNUnique(PandasOperator, VectorizableMixin):
62
62
  name: str = "GroupByThenNUnique"
63
63
  is_vectorizable: bool = True
64
64
  is_grouping: bool = True
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
78
78
  return nunique.merge(d2, how="right", on=[group_column])[value_columns]
79
79
 
80
80
 
81
- class GroupByThenFreq(PandasOperand):
81
+ class GroupByThenFreq(PandasOperator):
82
82
  name: str = "GroupByThenFreq"
83
83
  is_grouping: bool = True
84
84
  output_type: Optional[str] = "float"
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperandRegistry(type(BaseModel)):
9
+ class OperatorRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -33,7 +33,7 @@ class OperandRegistry(type(BaseModel)):
33
33
  return new_class
34
34
 
35
35
  @classmethod
36
- def get_operand(cls, name: str) -> Optional["Operand"]:
36
+ def get_operand(cls, name: str) -> Optional["Operator"]:
37
37
  # First try to resolve as a parametrized operand formula
38
38
  for operand_cls in cls._parametrized_registry:
39
39
  resolved = operand_cls.from_formula(name)
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
46
46
  return None
47
47
 
48
48
 
49
- class Operand(BaseModel, metaclass=OperandRegistry):
49
+ class Operator(BaseModel, metaclass=OperatorRegistry):
50
50
  name: Optional[str] = None
51
51
  alias: Optional[str] = None
52
52
  is_unary: bool = False
@@ -75,7 +75,7 @@ class Operand(BaseModel, metaclass=OperandRegistry):
75
75
  return self.name
76
76
 
77
77
 
78
- class ParametrizedOperand(Operand, abc.ABC):
78
+ class ParametrizedOperator(Operator, abc.ABC):
79
79
 
80
80
  @abc.abstractmethod
81
81
  def to_formula(self) -> str:
@@ -83,14 +83,14 @@ class ParametrizedOperand(Operand, abc.ABC):
83
83
 
84
84
  @classmethod
85
85
  @abc.abstractmethod
86
- def from_formula(cls, formula: str) -> Optional["Operand"]:
86
+ def from_formula(cls, formula: str) -> Optional["Operator"]:
87
87
  pass
88
88
 
89
89
 
90
90
  MAIN_COLUMN = "main_column"
91
91
 
92
92
 
93
- class PandasOperand(Operand, abc.ABC):
93
+ class PandasOperator(Operator, abc.ABC):
94
94
  def calculate(self, **kwargs) -> pd.Series:
95
95
  if self.is_unary:
96
96
  return self.calculate_unary(kwargs["data"])
@@ -131,7 +131,7 @@ class PandasOperand(Operand, abc.ABC):
131
131
  return value
132
132
 
133
133
 
134
- class VectorizableMixin(Operand):
134
+ class VectorizableMixin(Operator):
135
135
  group_index: int = 1
136
136
 
137
137
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
@@ -0,0 +1,200 @@
1
+ import abc
2
+ from typing import Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator
6
+
7
+ try:
8
+ from pydantic import field_validator as validator # V2
9
+ except ImportError:
10
+ from pydantic import validator # V1
11
+
12
+
13
+ class TimeSeriesBase(PandasOperator, abc.ABC):
14
+ is_vector: bool = True
15
+ date_unit: Optional[str] = None
16
+ offset_size: int = 0
17
+ offset_unit: str = "D"
18
+
19
+ def get_params(self) -> Dict[str, Optional[str]]:
20
+ res = super().get_params()
21
+ res.update(
22
+ {
23
+ "date_unit": self.date_unit,
24
+ "offset_size": self.offset_size,
25
+ "offset_unit": self.offset_unit,
26
+ }
27
+ )
28
+ return res
29
+
30
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
31
+ # assuming first is date, last is value, rest is group columns
32
+ date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
33
+ ts = pd.concat([date] + data[1:], axis=1)
34
+ ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
35
+ ts.set_index(date.name, inplace=True)
36
+ ts = ts[ts.index.notna()].sort_index()
37
+ ts = (
38
+ ts.groupby([c.name for c in data[1:-1]], group_keys=True)
39
+ .apply(self._shift)[data[-1].name]
40
+ .to_frame()
41
+ .reset_index()
42
+ .set_index(date.name)
43
+ .groupby([c.name for c in data[1:-1]])
44
+ if len(data) > 2
45
+ else self._shift(ts)
46
+ )
47
+ ts = self._aggregate(ts)
48
+ ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
49
+ ts.index = date.index
50
+
51
+ return ts.iloc[:, -1]
52
+
53
+ def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
54
+ if self.offset_size > 0:
55
+ return ts.iloc[:, :-1].merge(
56
+ ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
57
+ left_index=True,
58
+ right_index=True,
59
+ )
60
+ return ts
61
+
62
+ @abc.abstractmethod
63
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
64
+ pass
65
+
66
+
67
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
68
+
69
+
70
+ class Roll(TimeSeriesBase, ParametrizedOperator):
71
+ aggregation: str
72
+ window_size: int = 1
73
+ window_unit: str = "D"
74
+
75
+ @validator("window_unit")
76
+ @classmethod
77
+ def validate_window_unit(cls, v: str) -> str:
78
+ try:
79
+ pd.tseries.frequencies.to_offset(v)
80
+ return v
81
+ except ValueError:
82
+ raise ValueError(
83
+ f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
84
+ )
85
+
86
+ def to_formula(self) -> str:
87
+ roll_component = f"roll_{self.window_size}{self.window_unit}"
88
+ if self.offset_size > 0:
89
+ roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
90
+ return f"{roll_component}_{self.aggregation}"
91
+
92
+ @classmethod
93
+ def from_formula(cls, formula: str) -> Optional["Roll"]:
94
+ import re
95
+
96
+ # Try matching pattern with offset first
97
+ pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
98
+ match_with_offset = re.match(pattern_with_offset, formula)
99
+
100
+ if match_with_offset:
101
+ window_size = int(match_with_offset.group(1))
102
+ window_unit = match_with_offset.group(2)
103
+ offset_size = int(match_with_offset.group(3))
104
+ offset_unit = match_with_offset.group(4)
105
+ aggregation = match_with_offset.group(5)
106
+
107
+ return cls(
108
+ window_size=window_size,
109
+ window_unit=window_unit,
110
+ offset_size=offset_size,
111
+ offset_unit=offset_unit,
112
+ aggregation=aggregation,
113
+ )
114
+
115
+ # If no offset pattern found, try basic pattern
116
+ pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
117
+ match = re.match(pattern, formula)
118
+
119
+ if not match:
120
+ return None
121
+
122
+ window_size = int(match.group(1))
123
+ window_unit = match.group(2)
124
+ aggregation = match.group(3)
125
+
126
+ return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
127
+
128
+ def get_params(self) -> Dict[str, Optional[str]]:
129
+ res = super().get_params()
130
+ res.update(
131
+ {
132
+ "window_size": self.window_size,
133
+ "window_unit": self.window_unit,
134
+ "aggregation": self.aggregation,
135
+ }
136
+ )
137
+ return res
138
+
139
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
140
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
141
+ _roll_aggregations.get(self.aggregation, self.aggregation)
142
+ )
143
+
144
+
145
+ class Lag(TimeSeriesBase, ParametrizedOperator):
146
+ lag_size: int
147
+ lag_unit: str = "D"
148
+
149
+ def to_formula(self) -> str:
150
+ lag_component = f"lag_{self.lag_size}{self.lag_unit}"
151
+ if self.offset_size > 0:
152
+ lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
153
+ return lag_component
154
+
155
+ @classmethod
156
+ def from_formula(cls, formula: str) -> Optional["Lag"]:
157
+ import re
158
+
159
+ # Try matching pattern with offset first
160
+ pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
161
+ match_with_offset = re.match(pattern_with_offset, formula)
162
+
163
+ if match_with_offset:
164
+ lag_size = int(match_with_offset.group(1))
165
+ lag_unit = match_with_offset.group(2)
166
+ offset_size = int(match_with_offset.group(3))
167
+ offset_unit = match_with_offset.group(4)
168
+
169
+ return cls(
170
+ lag_size=lag_size,
171
+ lag_unit=lag_unit,
172
+ offset_size=offset_size,
173
+ offset_unit=offset_unit,
174
+ )
175
+
176
+ # If no offset pattern found, try basic pattern
177
+ pattern = r"^lag_(\d+)([a-zA-Z])$"
178
+ match = re.match(pattern, formula)
179
+
180
+ if not match:
181
+ return None
182
+
183
+ lag_size = int(match.group(1))
184
+ lag_unit = match.group(2)
185
+
186
+ return cls(lag_size=lag_size, lag_unit=lag_unit)
187
+
188
+ def get_params(self) -> Dict[str, Optional[str]]:
189
+ res = super().get_params()
190
+ res.update(
191
+ {
192
+ "lag_size": self.lag_size,
193
+ "lag_unit": self.lag_unit,
194
+ }
195
+ )
196
+ return res
197
+
198
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
199
+ lag_window = self.lag_size + 1
200
+ return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
upgini/autofe/unary.py CHANGED
@@ -2,10 +2,10 @@ from typing import Dict, Optional
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
6
6
 
7
7
 
8
- class Abs(PandasOperand, VectorizableMixin):
8
+ class Abs(PandasOperator, VectorizableMixin):
9
9
  name: str = "abs"
10
10
  is_unary: bool = True
11
11
  is_vectorizable: bool = True
@@ -20,7 +20,7 @@ class Abs(PandasOperand, VectorizableMixin):
20
20
  # return data.abs()
21
21
 
22
22
 
23
- class Log(PandasOperand, VectorizableMixin):
23
+ class Log(PandasOperator, VectorizableMixin):
24
24
  name: str = "log"
25
25
  is_unary: bool = True
26
26
  is_vectorizable: bool = True
@@ -34,7 +34,7 @@ class Log(PandasOperand, VectorizableMixin):
34
34
  return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
35
35
 
36
36
 
37
- class Sqrt(PandasOperand, VectorizableMixin):
37
+ class Sqrt(PandasOperator, VectorizableMixin):
38
38
  name: str = "sqrt"
39
39
  is_unary: bool = True
40
40
  is_vectorizable: bool = True
@@ -48,7 +48,7 @@ class Sqrt(PandasOperand, VectorizableMixin):
48
48
  return self._round_value(np.sqrt(data.abs()))
49
49
 
50
50
 
51
- class Square(PandasOperand, VectorizableMixin):
51
+ class Square(PandasOperator, VectorizableMixin):
52
52
  name: str = "square"
53
53
  is_unary: bool = True
54
54
  is_vectorizable: bool = True
@@ -61,7 +61,7 @@ class Square(PandasOperand, VectorizableMixin):
61
61
  return np.square(data)
62
62
 
63
63
 
64
- class Sigmoid(PandasOperand, VectorizableMixin):
64
+ class Sigmoid(PandasOperator, VectorizableMixin):
65
65
  name: str = "sigmoid"
66
66
  is_unary: bool = True
67
67
  is_vectorizable: bool = True
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperand, VectorizableMixin):
75
75
  return self._round_value(1 / (1 + np.exp(-data)))
76
76
 
77
77
 
78
- class Floor(PandasOperand, VectorizableMixin):
78
+ class Floor(PandasOperator, VectorizableMixin):
79
79
  name: str = "floor"
80
80
  is_unary: bool = True
81
81
  is_vectorizable: bool = True
@@ -90,7 +90,7 @@ class Floor(PandasOperand, VectorizableMixin):
90
90
  return np.floor(data)
91
91
 
92
92
 
93
- class Residual(PandasOperand, VectorizableMixin):
93
+ class Residual(PandasOperator, VectorizableMixin):
94
94
  name: str = "residual"
95
95
  is_unary: bool = True
96
96
  is_vectorizable: bool = True
@@ -104,7 +104,7 @@ class Residual(PandasOperand, VectorizableMixin):
104
104
  return data - np.floor(data)
105
105
 
106
106
 
107
- class Freq(PandasOperand):
107
+ class Freq(PandasOperator):
108
108
  name: str = "freq"
109
109
  is_unary: bool = True
110
110
  output_type: Optional[str] = "float"
@@ -116,7 +116,7 @@ class Freq(PandasOperand):
116
116
  return self._loc(data, value_counts)
117
117
 
118
118
 
119
- class Norm(PandasOperand):
119
+ class Norm(PandasOperator):
120
120
  name: str = "norm"
121
121
  is_unary: bool = True
122
122
  output_type: Optional[str] = "float"
@@ -148,7 +148,7 @@ class Norm(PandasOperand):
148
148
  return res
149
149
 
150
150
 
151
- class Embeddings(PandasOperand):
151
+ class Embeddings(PandasOperator):
152
152
  name: str = "emb"
153
153
  is_unary: bool = True
154
154
  input_type: Optional[str] = "string"
upgini/autofe/vector.py CHANGED
@@ -1,17 +1,11 @@
1
- import abc
2
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
3
2
 
4
3
  import pandas as pd
5
4
 
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
5
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
10
6
 
11
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
7
 
13
-
14
- class Mean(PandasOperand, VectorizableMixin):
8
+ class Mean(PandasOperator, VectorizableMixin):
15
9
  name: str = "mean"
16
10
  output_type: Optional[str] = "float"
17
11
  is_vector: bool = True
@@ -21,200 +15,10 @@ class Mean(PandasOperand, VectorizableMixin):
21
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
22
16
 
23
17
 
24
- class Sum(PandasOperand, VectorizableMixin):
18
+ class Sum(PandasOperator, VectorizableMixin):
25
19
  name: str = "sum"
26
20
  is_vector: bool = True
27
21
  group_index: int = 0
28
22
 
29
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
-
32
-
33
- class TimeSeriesBase(PandasOperand, abc.ABC):
34
- is_vector: bool = True
35
- date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
-
39
- def get_params(self) -> Dict[str, Optional[str]]:
40
- res = super().get_params()
41
- res.update(
42
- {
43
- "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
- }
47
- )
48
- return res
49
-
50
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
51
- # assuming first is date, last is value, rest is group columns
52
- date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
53
- ts = pd.concat([date] + data[1:], axis=1)
54
- ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
- ts.set_index(date.name, inplace=True)
56
- ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
67
- ts = self._aggregate(ts)
68
- ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
- ts.index = date.index
70
-
71
- return ts.iloc[:, -1]
72
-
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
- @abc.abstractmethod
83
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
- pass
85
-
86
-
87
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
88
-
89
-
90
- class Roll(TimeSeriesBase, ParametrizedOperand):
91
- aggregation: str
92
- window_size: int = 1
93
- window_unit: str = "D"
94
-
95
- @validator("window_unit")
96
- @classmethod
97
- def validate_window_unit(cls, v: str) -> str:
98
- try:
99
- pd.tseries.frequencies.to_offset(v)
100
- return v
101
- except ValueError:
102
- raise ValueError(
103
- f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
104
- )
105
-
106
- def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
111
-
112
- @classmethod
113
- def from_formula(cls, formula: str) -> Optional["Roll"]:
114
- import re
115
-
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
- pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
- match = re.match(pattern, formula)
138
-
139
- if not match:
140
- return None
141
-
142
- window_size = int(match.group(1))
143
- window_unit = match.group(2)
144
- aggregation = match.group(3)
145
-
146
- return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
147
-
148
- def get_params(self) -> Dict[str, Optional[str]]:
149
- res = super().get_params()
150
- res.update(
151
- {
152
- "window_size": self.window_size,
153
- "window_unit": self.window_unit,
154
- "aggregation": self.aggregation,
155
- }
156
- )
157
- return res
158
-
159
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
161
- _roll_aggregations.get(self.aggregation, self.aggregation)
162
- )
163
-
164
-
165
- class Lag(TimeSeriesBase, ParametrizedOperand):
166
- lag_size: int
167
- lag_unit: str = "D"
168
-
169
- def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
174
-
175
- @classmethod
176
- def from_formula(cls, formula: str) -> Optional["Lag"]:
177
- import re
178
-
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
- pattern = r"^lag_(\d+)([a-zA-Z])$"
198
- match = re.match(pattern, formula)
199
-
200
- if not match:
201
- return None
202
-
203
- lag_size = int(match.group(1))
204
- lag_unit = match.group(2)
205
-
206
- return cls(lag_size=lag_size, lag_unit=lag_unit)
207
-
208
- def get_params(self) -> Dict[str, Optional[str]]:
209
- res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
- return res
217
-
218
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
219
- lag_window = self.lag_size + 1
220
- return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
31
31
  from sklearn.model_selection import BaseCrossValidator
32
32
 
33
33
  from upgini.autofe.feature import Feature
34
- from upgini.autofe.vector import TimeSeriesBase
34
+ from upgini.autofe.timeseries import TimeSeriesBase
35
35
  from upgini.data_source.data_source_publisher import CommercialSchema
36
36
  from upgini.dataset import Dataset
37
37
  from upgini.errors import HttpError, ValidationError
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.62
3
+ Version: 1.2.62a3818.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=X-PIyJPyy-W4DbKWDuHTMhmvRT8La2rsZ63Zaf_MERI,23
1
+ upgini/__about__.py,sha256=OLozvzWRYF8QVe08Gh2xAIzV-SPbWN9X8WcPvXKgTuU,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
6
+ upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,15 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
- upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
20
- upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
21
- upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
- upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
- upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
17
+ upgini/autofe/all_operands.py,sha256=VIT5jCq5U-qypdNz1MIQ_hlIAs0ujJgRfKRUkU24nFs,332
18
+ upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
19
+ upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
20
+ upgini/autofe/feature.py,sha256=Xto7FHH1JG-5QvkfTPNWKtV9GAzPviTNPKFZOUN7RQA,14757
21
+ upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
22
+ upgini/autofe/operator.py,sha256=RSJWoKB2pIZ5xToVuk_T0ec7QRx-duxYEEGJ5oealaM,4784
23
+ upgini/autofe/timeseries.py,sha256=-BnDp0z_Hv6Vol1Vov6QC_82U8XPV3pfIPFspK2aTCE,6598
24
+ upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
25
+ upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
25
26
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
27
  upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
27
28
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -62,7 +63,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
62
63
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
63
64
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
64
65
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
65
- upgini-1.2.62.dist-info/METADATA,sha256=l1TBHJEV26NNT_Er41bbO3ph5UZ-QkzYTpf_JU1Y7ak,49084
66
- upgini-1.2.62.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
67
- upgini-1.2.62.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
68
- upgini-1.2.62.dist-info/RECORD,,
66
+ upgini-1.2.62a3818.dev2.dist-info/METADATA,sha256=VEJPjgu8A5gOrr4WPbk6DYHt8BNxoqUq9rsl967GQMU,49094
67
+ upgini-1.2.62a3818.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
68
+ upgini-1.2.62a3818.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
69
+ upgini-1.2.62a3818.dev2.dist-info/RECORD,,