upgini 1.2.20__py3-none-any.whl → 1.2.20a3657.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.20"
1
+ __version__ = "1.2.20a3657.dev1"
upgini/__init__.py CHANGED
@@ -2,6 +2,7 @@ import os
2
2
 
3
3
  from upgini.features_enricher import FeaturesEnricher # noqa: F401
4
4
  from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
5
+
5
6
  # from .lazy_import import LazyImport
6
7
 
7
8
  os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
@@ -1,87 +1,5 @@
1
- from copy import deepcopy
2
- from typing import Dict
3
-
4
- from upgini.autofe.binary import (
5
- Add,
6
- Combine,
7
- CombineThenFreq,
8
- Distance,
9
- Divide,
10
- JaroWinklerSim1,
11
- JaroWinklerSim2,
12
- LevenshteinSim,
13
- Max,
14
- Min,
15
- Multiply,
16
- Sim,
17
- Subtract,
18
- )
19
- from upgini.autofe.date import (
20
- DateDiff,
21
- DateDiffType2,
22
- DateListDiff,
23
- DateListDiffBounded,
24
- DatePercentile,
25
- DatePercentileMethod2,
26
- )
27
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
28
- from upgini.autofe.operand import Operand
29
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
30
- from upgini.autofe.vector import Mean, Sum
31
-
32
- ALL_OPERANDS: Dict[str, Operand] = {
33
- op.name: op
34
- for op in [
35
- Freq(),
36
- Mean(),
37
- Sum(),
38
- Abs(),
39
- Log(),
40
- Sqrt(),
41
- Square(),
42
- Sigmoid(),
43
- Floor(),
44
- Residual(),
45
- Min(),
46
- Max(),
47
- Add(),
48
- Subtract(),
49
- Multiply(),
50
- Divide(),
51
- GroupByThenAgg(name="GroupByThenMin", agg="min"),
52
- GroupByThenAgg(name="GroupByThenMax", agg="max"),
53
- GroupByThenAgg(name="GroupByThenMean", agg="mean"),
54
- GroupByThenAgg(name="GroupByThenMedian", agg="median"),
55
- GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
56
- GroupByThenRank(),
57
- Combine(),
58
- CombineThenFreq(),
59
- GroupByThenNUnique(),
60
- GroupByThenFreq(),
61
- Sim(),
62
- DateDiff(),
63
- DateDiffType2(),
64
- DateListDiff(aggregation="min"),
65
- DateListDiff(aggregation="max"),
66
- DateListDiff(aggregation="mean"),
67
- DateListDiff(aggregation="nunique"),
68
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
69
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
70
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
71
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
72
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
73
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
74
- DatePercentile(),
75
- DatePercentileMethod2(),
76
- Norm(),
77
- JaroWinklerSim1(),
78
- JaroWinklerSim2(),
79
- LevenshteinSim(),
80
- Distance(),
81
- Embeddings(),
82
- ]
83
- }
1
+ from upgini.autofe.operand import OperandRegistry
84
2
 
85
3
 
86
4
  def find_op(name):
87
- return deepcopy(ALL_OPERANDS.get(name))
5
+ return OperandRegistry.get_operand(name)
upgini/autofe/date.py CHANGED
@@ -7,11 +7,11 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand
10
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
14
- major_version = int(pydantic_version.split('.')[0])
14
+ major_version = int(pydantic_version.split(".")[0])
15
15
  return major_version
16
16
 
17
17
 
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
109
109
  _count_aggregations = ["nunique", "count"]
110
110
 
111
111
 
112
- class DateListDiff(PandasOperand, DateDiffMixin):
112
+ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
113
113
  is_binary: bool = True
114
114
  has_symmetry_importance: bool = True
115
115
 
@@ -134,6 +134,15 @@ class DateListDiff(PandasOperand, DateDiffMixin):
134
134
  data["name"] = f"date_diff_{data.get('aggregation')}"
135
135
  super().__init__(**data)
136
136
 
137
@classmethod
def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
    """Build an operand from a ``date_diff_<aggregation>`` formula name.

    Returns ``None`` when ``formula`` does not start with ``date_diff_``, when
    nothing follows the prefix, or when the remainder contains an underscore.
    Otherwise returns ``cls(aggregation=<remainder>)``.
    """
    prefix = "date_diff_"
    if not formula.startswith(prefix):
        return None
    # Strip only the leading prefix; ``str.replace`` would also remove any later
    # occurrence of the same text and could turn a malformed name into a valid one.
    aggregation = formula[len(prefix):]
    if not aggregation or "_" in aggregation:
        return None
    return cls(aggregation=aggregation)
145
+
137
146
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
138
147
  left = self._convert_to_date(left, self.left_unit)
139
148
  right_mask = right.apply(lambda x: len(x) > 0)
@@ -170,7 +179,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
170
179
  return method(x) if len(x) > 0 else default
171
180
 
172
181
 
173
- class DateListDiffBounded(DateListDiff):
182
+ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
174
183
  lower_bound: Optional[int] = None
175
184
  upper_bound: Optional[int] = None
176
185
 
@@ -188,6 +197,23 @@ class DateListDiffBounded(DateListDiff):
188
197
  data["name"] = "_".join(components)
189
198
  super().__init__(**data)
190
199
 
200
@classmethod
def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
    """Build an operand from ``date_diff_<unit>_<lower>_<upper>_<aggregation>``.

    ``<lower>`` is either digits or ``minusinf`` and ``<upper>`` is either digits
    or ``plusinf``; the two sentinels are translated to a bound of ``None``.
    Returns ``None`` when ``formula`` does not have this shape.
    """
    import re

    # Named groups replace the nested duplicate groups of the previous pattern,
    # and ``fullmatch`` (unlike ``match`` with ``$``) rejects a trailing newline.
    pattern = r"date_diff_(?P<unit>[^_]+)_(?P<lower>minusinf|\d+)_(?P<upper>plusinf|\d+)_(?P<agg>\w+)"
    match = re.fullmatch(pattern, formula)
    if match is None:
        return None

    lower = match.group("lower")
    upper = match.group("upper")
    return cls(
        diff_unit=match.group("unit"),
        lower_bound=None if lower == "minusinf" else int(lower),
        upper_bound=None if upper == "plusinf" else int(upper),
        aggregation=match.group("agg"),
    )
216
+
191
217
  def _agg(self, x):
192
218
  x = x[
193
219
  (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -257,16 +283,17 @@ class DatePercentile(DatePercentileBase):
257
283
  # Use @field_validator for Pydantic 2.x
258
284
  from pydantic import field_validator
259
285
 
260
- @field_validator('zero_bounds', mode='before')
286
+ @field_validator("zero_bounds", mode="before")
261
287
  def parse_zero_bounds(cls, value):
262
288
  if isinstance(value, str):
263
289
  return json.loads(value)
264
290
  return value
291
+
265
292
  else:
266
293
  # Use @validator for Pydantic 1.x
267
294
  from pydantic import validator
268
295
 
269
- @validator('zero_bounds', pre=True)
296
+ @validator("zero_bounds", pre=True)
270
297
  def parse_zero_bounds(cls, value):
271
298
  if isinstance(value, str):
272
299
  return json.loads(value)
upgini/autofe/operand.py CHANGED
@@ -6,7 +6,47 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class Operand(BaseModel):
9
class OperandRegistry(type(BaseModel)):
    """Metaclass that records ``Operand`` subclasses as they are defined.

    Classes with an ancestor named ``ParametrizedOperand`` are kept in
    ``_parametrized_registry`` and are resolved through their ``from_formula``
    classmethod. Other classes with an ancestor named ``Operand`` are
    instantiated without arguments and stored in ``_registry`` under the
    instance's ``name``; classes that cannot be instantiated that way are not
    registered.
    """

    # name -> operand class, for operands constructible without arguments
    _registry = {}
    # operand classes resolved via ``from_formula``
    _parametrized_registry = []

    def __new__(cls, name, bases, attrs, **kwargs):
        # Forward class keyword arguments so that class statements (or factory
        # helpers) that pass them to the metaclass keep working.
        new_class = super().__new__(cls, name, bases, attrs, **kwargs)

        # Every ancestor of the new class, excluding the class itself.
        ancestor_names = {klass.__name__ for klass in new_class.__mro__[1:]}
        if "Operand" not in ancestor_names:
            return new_class

        if "ParametrizedOperand" in ancestor_names:
            cls._parametrized_registry.append(new_class)
            return new_class

        try:
            instance = new_class()
        except Exception:
            # Deliberately best-effort: bases, abstract classes and operands with
            # required fields cannot be built without arguments and are simply
            # left out of the name registry.
            return new_class
        cls._registry[instance.name] = new_class
        return new_class

    @classmethod
    def get_operand(cls, name: str) -> Optional["Operand"]:
        """Return a new operand instance for ``name``, or ``None`` if unknown.

        An exact registered name is checked first so that it cannot be shadowed
        by a looser formula pattern (for example one accepting any name that
        starts with ``date_diff_``); only then are the parametrized operand
        classes asked to parse ``name`` as a formula, in registration order.
        """
        exact = cls._registry.get(name)
        if exact is not None:
            return exact()
        for operand_cls in cls._parametrized_registry:
            resolved = operand_cls.from_formula(name)
            if resolved is not None:
                return resolved
        return None
49
+ class Operand(BaseModel, metaclass=OperandRegistry):
10
50
  name: str
11
51
  alias: Optional[str] = None
12
52
  is_unary: bool = False
@@ -32,6 +72,12 @@ class Operand(BaseModel):
32
72
  return res
33
73
 
34
74
 
75
class ParametrizedOperand(Operand):
    """Base for operands that can be rebuilt from a formula name."""

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Operand"]:
        """Parse ``formula`` into an operand instance.

        The base implementation recognises no formula and always returns
        ``None``; subclasses override it.
        """
        return None
+
80
+
35
81
  MAIN_COLUMN = "main_column"
36
82
 
37
83
 
upgini/autofe/vector.py CHANGED
@@ -1,8 +1,10 @@
1
- from typing import List, Optional
1
+ import abc
2
+ from typing import Any, Dict, List, Optional
2
3
 
3
4
  import pandas as pd
5
+ from pydantic import validator
4
6
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
6
8
 
7
9
 
8
10
  class Mean(PandasOperand, VectorizableMixin):
@@ -22,3 +24,132 @@ class Sum(PandasOperand, VectorizableMixin):
22
24
 
23
25
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
26
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
27
+
28
+
29
class TimeSeriesBase(PandasOperand, abc.ABC):
    """Base class for vector operands that aggregate a value over a date index.

    Subclasses implement ``_aggregate``; ``calculate_vector`` prepares the
    date-indexed (optionally grouped) frame and maps the aggregated values back
    onto the input rows.
    """

    is_vector: bool = True
    # Passed as ``unit`` to ``pd.to_datetime`` when parsing the date column.
    date_unit: Optional[str] = None

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with ``date_unit``."""
        res = super().get_params()
        res.update({"date_unit": self.date_unit})
        return res

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Aggregate the value column and return one result per input row.

        Args:
            data: assumed layout is date first, value last, group columns in
                between.

        Returns:
            The last column of the aggregated frame, indexed like ``data[0]``.
        """
        # assuming first is date, last is value, rest is group columns
        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
        group_columns = data[1:-1]

        ts = pd.concat([date] + data[1:], axis=1)
        # Keep the first row for each (date, group...) combination.
        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
        ts.set_index(date.name, inplace=True)
        # Rows whose date could not be parsed are excluded from the aggregation.
        ts = ts[ts.index.notna()].sort_index()
        ts = ts.groupby([c.name for c in group_columns]) if len(data) > 2 else ts
        ts = self._aggregate(ts)
        # Look the aggregated values up again for every input row, in input order.
        ts = ts.reindex(group_columns + [date] if len(data) > 2 else date).reset_index()
        # ``reset_index`` leaves a fresh RangeIndex; restore the caller's index so
        # the result aligns with the input rows when it is assigned back.
        ts.index = date.index

        return ts.iloc[:, -1]

    @abc.abstractmethod
    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Aggregate the date-indexed (optionally grouped) frame ``ts``."""
        pass
+
59
+
60
def _norm_mean(window):
    """Return the last value of ``window`` divided by the window mean."""
    # Rolling windows handed over as a Series carry a datetime index, where a
    # bare ``window[-1]`` relies on the deprecated positional fallback; index by
    # position explicitly. Plain arrays keep ordinary indexing.
    last = window.iloc[-1] if hasattr(window, "iloc") else window[-1]
    return last / window.mean()


# Custom rolling aggregations, looked up by name.
_roll_aggregations = {"norm_mean": _norm_mean}
61
+
62
+
63
class Roll(TimeSeriesBase, ParametrizedOperand):
    """Rolling-window aggregation, named ``roll_<size><unit>_<aggregation>``."""

    aggregation: str
    window_size: int = 1
    window_unit: str = "D"

    @validator("window_unit")
    def validate_window_unit(cls, v: str) -> str:
        """Accept ``v`` only if pandas can parse it as a frequency string."""
        try:
            pd.tseries.frequencies.to_offset(v)
        except ValueError as err:
            # Chain the pandas error so the original cause is not lost.
            raise ValueError(
                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
            ) from err
        return v

    def __init__(self, **data: Any) -> None:
        # Derive the lower-cased name from the parameters unless one was given.
        if "name" not in data:
            components = [
                "roll",
                str(data.get("window_size") or 1) + str(data.get("window_unit") or "D"),
                data.get("aggregation"),
            ]
            data["name"] = "_".join(components).lower()
        super().__init__(**data)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Roll"]:
        """Parse ``roll_<digits><letter>_<aggregation>``; return ``None`` otherwise."""
        import re

        # ``fullmatch`` (unlike ``match`` with ``$``) rejects a trailing newline.
        match = re.fullmatch(r"roll_(\d+)([a-zA-Z])_(\w+)", formula)
        if match is None:
            return None

        size, unit, aggregation = match.groups()
        return cls(window_size=int(size), window_unit=unit, aggregation=aggregation)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the window settings."""
        res = super().get_params()
        res.update(
            {
                "window_size": self.window_size,
                "window_unit": self.window_unit,
                "aggregation": self.aggregation,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        # Names missing from ``_roll_aggregations`` are handed to pandas as is.
        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
            _roll_aggregations.get(self.aggregation, self.aggregation)
        )
119
+
120
+
121
class Lag(TimeSeriesBase, ParametrizedOperand):
    """Lagged value operand, named ``lag_<size><unit>``."""

    lag_size: int
    lag_unit: str = "D"

    def __init__(self, **data: Any) -> None:
        # Derive the lower-cased name from the parameters unless one was given.
        if "name" not in data:
            components = [
                "lag",
                str(data.get("lag_size") or 1) + str(data.get("lag_unit") or "D"),
            ]
            data["name"] = "_".join(components).lower()
        super().__init__(**data)

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Lag"]:
        """Parse ``lag_<digits><letter>``; return ``None`` for anything else."""
        import re

        # ``fullmatch`` (unlike ``match`` with ``$``) rejects a trailing newline.
        match = re.fullmatch(r"lag_(\d+)([a-zA-Z])", formula)
        if match is None:
            return None

        size, unit = match.groups()
        return cls(lag_size=int(size), lag_unit=unit)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the lag settings."""
        res = super().get_params()
        # Report the lag configuration, mirroring what ``Roll.get_params`` does
        # for its window settings.
        res.update(
            {
                "lag_size": self.lag_size,
                "lag_unit": self.lag_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        # A window of ``lag_size + 1`` units that must hold as many observations;
        # the first value in that window is the result.
        lag_window = self.lag_size + 1
        # Select by position: the window is indexed by dates, so a bare ``x[0]``
        # would rely on the deprecated positional fallback.
        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x.iloc[0])
@@ -1,12 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.20
3
+ Version: 1.2.20a3657.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
7
7
  Project-URL: Source, https://github.com/upgini/upgini
8
8
  Author-email: Upgini Developers <madewithlove@upgini.com>
9
- License-Expression: BSD-3-Clause
10
9
  License-File: LICENSE
11
10
  Keywords: automl,data mining,data science,data search,machine learning
12
11
  Classifier: Development Status :: 5 - Production/Stable
@@ -1,5 +1,5 @@
1
- upgini/__about__.py,sha256=nQtXpLTEUbMtAPecTV_hZAJZb9EhWc8glRv6hgKyvG4,23
2
- upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
1
+ upgini/__about__.py,sha256=YkZ_uLYHtqgChcjML_VbuHRPzZ0weOtfhilztAaEx10,33
2
+ upgini/__init__.py,sha256=Mb_sTh-IiGiyQLExOF226RsqnpVH8u1ozaCSW3Scdx4,590
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
17
+ upgini/autofe/all_operands.py,sha256=z3RSj98mkIXOkkmXHVCV7ese6V6rgD4uXyHge65HMVA,116
18
18
  upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
19
+ upgini/autofe/date.py,sha256=kC1oQ_LKaqq-JTiqzIbUti-JB3bWizaB5nvXQ_BoD6Y,10780
20
20
  upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
- upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
22
+ upgini/autofe/operand.py,sha256=sEyFD_SdQ5tqJ5yGUZlXSqUnQb6WxOqZ0bMS6oKDjdU,4593
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
24
+ upgini/autofe/vector.py,sha256=KBoEcRywc1xdgYLCPlkUnKi5w0wCF0j3IYQP5eSmmgY,4807
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.20.dist-info/METADATA,sha256=NVxQ5AA2uDaCtzEDlqWqpG6uEOi2xufY3pqvO9XtdgY,48611
61
- upgini-1.2.20.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.20.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.20.dist-info/RECORD,,
60
+ upgini-1.2.20a3657.dev1.dist-info/METADATA,sha256=bgrk-SB81K0mrOkFRfrSl04-TuA2wxZWIbYdQOJePKA,48588
61
+ upgini-1.2.20a3657.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.2.20a3657.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.20a3657.dev1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any