upgini 1.2.19a1__tar.gz → 1.2.20a3657.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/PKG-INFO +1 -1
  2. upgini-1.2.20a3657.dev1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/__init__.py +1 -0
  4. upgini-1.2.20a3657.dev1/src/upgini/autofe/all_operands.py +5 -0
  5. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/date.py +33 -6
  6. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/operand.py +47 -1
  7. upgini-1.2.20a3657.dev1/src/upgini/autofe/vector.py +155 -0
  8. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/features_enricher.py +9 -10
  9. upgini-1.2.19a1/src/upgini/__about__.py +0 -1
  10. upgini-1.2.19a1/src/upgini/autofe/all_operands.py +0 -87
  11. upgini-1.2.19a1/src/upgini/autofe/vector.py +0 -24
  12. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/.gitignore +0 -0
  13. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/LICENSE +0 -0
  14. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/README.md +0 -0
  15. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/pyproject.toml +0 -0
  16. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/binary.py +0 -0
  21. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/http.py +0 -0
  29. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.19a1
3
+ Version: 1.2.20a3657.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.20a3657.dev1"
@@ -2,6 +2,7 @@ import os
2
2
 
3
3
  from upgini.features_enricher import FeaturesEnricher # noqa: F401
4
4
  from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
5
+
5
6
  # from .lazy_import import LazyImport
6
7
 
7
8
  os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
@@ -0,0 +1,5 @@
1
+ from upgini.autofe.operand import OperandRegistry
2
+
3
+
4
+ def find_op(name):
5
+ return OperandRegistry.get_operand(name)
@@ -7,11 +7,11 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand
10
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
14
- major_version = int(pydantic_version.split('.')[0])
14
+ major_version = int(pydantic_version.split(".")[0])
15
15
  return major_version
16
16
 
17
17
 
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
109
109
  _count_aggregations = ["nunique", "count"]
110
110
 
111
111
 
112
- class DateListDiff(PandasOperand, DateDiffMixin):
112
+ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
113
113
  is_binary: bool = True
114
114
  has_symmetry_importance: bool = True
115
115
 
@@ -134,6 +134,15 @@ class DateListDiff(PandasOperand, DateDiffMixin):
134
134
  data["name"] = f"date_diff_{data.get('aggregation')}"
135
135
  super().__init__(**data)
136
136
 
137
+ @classmethod
138
+ def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
139
+ if not formula.startswith("date_diff_"):
140
+ return None
141
+ aggregation = formula.replace("date_diff_", "")
142
+ if "_" in aggregation:
143
+ return None
144
+ return cls(aggregation=aggregation)
145
+
137
146
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
138
147
  left = self._convert_to_date(left, self.left_unit)
139
148
  right_mask = right.apply(lambda x: len(x) > 0)
@@ -170,7 +179,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
170
179
  return method(x) if len(x) > 0 else default
171
180
 
172
181
 
173
- class DateListDiffBounded(DateListDiff):
182
+ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
174
183
  lower_bound: Optional[int] = None
175
184
  upper_bound: Optional[int] = None
176
185
 
@@ -188,6 +197,23 @@ class DateListDiffBounded(DateListDiff):
188
197
  data["name"] = "_".join(components)
189
198
  super().__init__(**data)
190
199
 
200
+ @classmethod
201
+ def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
202
+ import re
203
+
204
+ pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
205
+ match = re.match(pattern, formula)
206
+
207
+ if not match:
208
+ return None
209
+
210
+ diff_unit = match.group(1)
211
+ lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
212
+ upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
213
+ aggregation = match.group(6)
214
+
215
+ return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
216
+
191
217
  def _agg(self, x):
192
218
  x = x[
193
219
  (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -257,16 +283,17 @@ class DatePercentile(DatePercentileBase):
257
283
  # Use @field_validator for Pydantic 2.x
258
284
  from pydantic import field_validator
259
285
 
260
- @field_validator('zero_bounds', mode='before')
286
+ @field_validator("zero_bounds", mode="before")
261
287
  def parse_zero_bounds(cls, value):
262
288
  if isinstance(value, str):
263
289
  return json.loads(value)
264
290
  return value
291
+
265
292
  else:
266
293
  # Use @validator for Pydantic 1.x
267
294
  from pydantic import validator
268
295
 
269
- @validator('zero_bounds', pre=True)
296
+ @validator("zero_bounds", pre=True)
270
297
  def parse_zero_bounds(cls, value):
271
298
  if isinstance(value, str):
272
299
  return json.loads(value)
@@ -6,7 +6,47 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class Operand(BaseModel):
9
+ class OperandRegistry(type(BaseModel)):
10
+ _registry = {}
11
+ _parametrized_registry = []
12
+
13
+ def __new__(cls, name, bases, attrs):
14
+ new_class = super().__new__(cls, name, bases, attrs)
15
+ # Only register if it's a concrete class that inherits from Operand
16
+ base_classes = [b for b in bases]
17
+ base_names = {b.__name__ for b in bases}
18
+ while base_classes:
19
+ base = base_classes.pop()
20
+ base_names.update(b.__name__ for b in base.__bases__)
21
+ base_classes.extend(base.__bases__)
22
+
23
+ if "Operand" in base_names:
24
+ # Track parametrized operands separately
25
+ if "ParametrizedOperand" in base_names:
26
+ cls._parametrized_registry.append(new_class)
27
+ else:
28
+ try:
29
+ instance = new_class()
30
+ cls._registry[instance.name] = new_class
31
+ except Exception:
32
+ pass
33
+ return new_class
34
+
35
+ @classmethod
36
+ def get_operand(cls, name: str) -> Optional["Operand"]:
37
+ # First try to resolve as a parametrized operand formula
38
+ for operand_cls in cls._parametrized_registry:
39
+ resolved = operand_cls.from_formula(name)
40
+ if resolved is not None:
41
+ return resolved
42
+ # Fall back to direct registry lookup
43
+ non_parametrized = cls._registry.get(name)
44
+ if non_parametrized is not None:
45
+ return non_parametrized()
46
+ return None
47
+
48
+
49
+ class Operand(BaseModel, metaclass=OperandRegistry):
10
50
  name: str
11
51
  alias: Optional[str] = None
12
52
  is_unary: bool = False
@@ -32,6 +72,12 @@ class Operand(BaseModel):
32
72
  return res
33
73
 
34
74
 
75
+ class ParametrizedOperand(Operand):
76
+ @classmethod
77
+ def from_formula(cls, formula: str) -> Optional["Operand"]:
78
+ pass
79
+
80
+
35
81
  MAIN_COLUMN = "main_column"
36
82
 
37
83
 
@@ -0,0 +1,155 @@
1
+ import abc
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from pydantic import validator
6
+
7
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
8
+
9
+
10
+ class Mean(PandasOperand, VectorizableMixin):
11
+ name: str = "mean"
12
+ output_type: Optional[str] = "float"
13
+ is_vector: bool = True
14
+ group_index: int = 0
15
+
16
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
17
+ return pd.DataFrame(data).T.fillna(0).mean(axis=1)
18
+
19
+
20
+ class Sum(PandasOperand, VectorizableMixin):
21
+ name: str = "sum"
22
+ is_vector: bool = True
23
+ group_index: int = 0
24
+
25
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
26
+ return pd.DataFrame(data).T.fillna(0).sum(axis=1)
27
+
28
+
29
+ class TimeSeriesBase(PandasOperand, abc.ABC):
30
+ is_vector: bool = True
31
+ date_unit: Optional[str] = None
32
+
33
+ def get_params(self) -> Dict[str, Optional[str]]:
34
+ res = super().get_params()
35
+ res.update(
36
+ {
37
+ "date_unit": self.date_unit,
38
+ }
39
+ )
40
+ return res
41
+
42
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
43
+ # assuming first is date, last is value, rest is group columns
44
+ date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
45
+ ts = pd.concat([date] + data[1:], axis=1)
46
+ ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
47
+ ts.set_index(date.name, inplace=True)
48
+ ts = ts[ts.index.notna()].sort_index()
49
+ ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
50
+ ts = self._aggregate(ts)
51
+ ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
52
+
53
+ return ts.iloc[:, -1]
54
+
55
+ @abc.abstractmethod
56
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
57
+ pass
58
+
59
+
60
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
61
+
62
+
63
+ class Roll(TimeSeriesBase, ParametrizedOperand):
64
+ aggregation: str
65
+ window_size: int = 1
66
+ window_unit: str = "D"
67
+
68
+ @validator("window_unit")
69
+ def validate_window_unit(cls, v: str) -> str:
70
+ try:
71
+ pd.tseries.frequencies.to_offset(v)
72
+ return v
73
+ except ValueError:
74
+ raise ValueError(
75
+ f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
76
+ )
77
+
78
+ def __init__(self, **data: Any) -> None:
79
+ if "name" not in data:
80
+ components = [
81
+ "roll",
82
+ str(data.get("window_size") or 1) + str(data.get("window_unit") or "D"),
83
+ data.get("aggregation"),
84
+ ]
85
+ data["name"] = "_".join(components).lower()
86
+ super().__init__(**data)
87
+
88
+ @classmethod
89
+ def from_formula(cls, formula: str) -> Optional["Roll"]:
90
+ import re
91
+
92
+ pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
93
+ match = re.match(pattern, formula)
94
+
95
+ if not match:
96
+ return None
97
+
98
+ window_size = int(match.group(1))
99
+ window_unit = match.group(2)
100
+ aggregation = match.group(3)
101
+
102
+ return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
103
+
104
+ def get_params(self) -> Dict[str, Optional[str]]:
105
+ res = super().get_params()
106
+ res.update(
107
+ {
108
+ "window_size": self.window_size,
109
+ "window_unit": self.window_unit,
110
+ "aggregation": self.aggregation,
111
+ }
112
+ )
113
+ return res
114
+
115
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
116
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
117
+ _roll_aggregations.get(self.aggregation, self.aggregation)
118
+ )
119
+
120
+
121
+ class Lag(TimeSeriesBase, ParametrizedOperand):
122
+ lag_size: int
123
+ lag_unit: str = "D"
124
+
125
+ def __init__(self, **data: Any) -> None:
126
+ if "name" not in data:
127
+ components = [
128
+ "lag",
129
+ str(data.get("lag_size") or 1) + str(data.get("lag_unit") or "D"),
130
+ ]
131
+ data["name"] = "_".join(components).lower()
132
+ super().__init__(**data)
133
+
134
+ @classmethod
135
+ def from_formula(cls, formula: str) -> Optional["Lag"]:
136
+ import re
137
+
138
+ pattern = r"^lag_(\d+)([a-zA-Z])$"
139
+ match = re.match(pattern, formula)
140
+
141
+ if not match:
142
+ return None
143
+
144
+ lag_size = int(match.group(1))
145
+ lag_unit = match.group(2)
146
+
147
+ return cls(lag_size=lag_size, lag_unit=lag_unit)
148
+
149
+ def get_params(self) -> Dict[str, Optional[str]]:
150
+ res = super().get_params()
151
+ return res
152
+
153
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
154
+ lag_window = self.lag_size + 1
155
+ return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
@@ -2474,8 +2474,16 @@ class FeaturesEnricher(TransformerMixin):
2474
2474
 
2475
2475
  df = pd.concat([validated_X, validated_y], axis=1)
2476
2476
 
2477
+ if validated_eval_set is not None and len(validated_eval_set) > 0:
2478
+ df[EVAL_SET_INDEX] = 0
2479
+ for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
2480
+ eval_df = pd.concat([eval_X, eval_y], axis=1)
2481
+ eval_df[EVAL_SET_INDEX] = idx + 1
2482
+ df = pd.concat([df, eval_df])
2483
+
2477
2484
  self.fit_search_keys = self.search_keys.copy()
2478
- self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2485
+ df = self.__handle_index_search_keys(df, self.fit_search_keys)
2486
+ self.fit_search_keys = self.__prepare_search_keys(df, self.fit_search_keys, is_demo_dataset)
2479
2487
 
2480
2488
  maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2481
2489
  has_date = maybe_date_column is not None
@@ -2487,17 +2495,8 @@ class FeaturesEnricher(TransformerMixin):
2487
2495
  self.loss, self.model_task_type, self.runtime_parameters, self.logger
2488
2496
  )
2489
2497
 
2490
- if validated_eval_set is not None and len(validated_eval_set) > 0:
2491
- df[EVAL_SET_INDEX] = 0
2492
- for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
2493
- eval_df = pd.concat([eval_X, eval_y], axis=1)
2494
- eval_df[EVAL_SET_INDEX] = idx + 1
2495
- df = pd.concat([df, eval_df])
2496
-
2497
2498
  df = self.__correct_target(df)
2498
2499
 
2499
- df = self.__handle_index_search_keys(df, self.fit_search_keys)
2500
-
2501
2500
  if DEFAULT_INDEX in df.columns:
2502
2501
  msg = self.bundle.get("unsupported_index_column")
2503
2502
  self.logger.info(msg)
@@ -1 +0,0 @@
1
- __version__ = "1.2.19a1"
@@ -1,87 +0,0 @@
1
- from copy import deepcopy
2
- from typing import Dict
3
-
4
- from upgini.autofe.binary import (
5
- Add,
6
- Combine,
7
- CombineThenFreq,
8
- Distance,
9
- Divide,
10
- JaroWinklerSim1,
11
- JaroWinklerSim2,
12
- LevenshteinSim,
13
- Max,
14
- Min,
15
- Multiply,
16
- Sim,
17
- Subtract,
18
- )
19
- from upgini.autofe.date import (
20
- DateDiff,
21
- DateDiffType2,
22
- DateListDiff,
23
- DateListDiffBounded,
24
- DatePercentile,
25
- DatePercentileMethod2,
26
- )
27
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
28
- from upgini.autofe.operand import Operand
29
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
30
- from upgini.autofe.vector import Mean, Sum
31
-
32
- ALL_OPERANDS: Dict[str, Operand] = {
33
- op.name: op
34
- for op in [
35
- Freq(),
36
- Mean(),
37
- Sum(),
38
- Abs(),
39
- Log(),
40
- Sqrt(),
41
- Square(),
42
- Sigmoid(),
43
- Floor(),
44
- Residual(),
45
- Min(),
46
- Max(),
47
- Add(),
48
- Subtract(),
49
- Multiply(),
50
- Divide(),
51
- GroupByThenAgg(name="GroupByThenMin", agg="min"),
52
- GroupByThenAgg(name="GroupByThenMax", agg="max"),
53
- GroupByThenAgg(name="GroupByThenMean", agg="mean"),
54
- GroupByThenAgg(name="GroupByThenMedian", agg="median"),
55
- GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
56
- GroupByThenRank(),
57
- Combine(),
58
- CombineThenFreq(),
59
- GroupByThenNUnique(),
60
- GroupByThenFreq(),
61
- Sim(),
62
- DateDiff(),
63
- DateDiffType2(),
64
- DateListDiff(aggregation="min"),
65
- DateListDiff(aggregation="max"),
66
- DateListDiff(aggregation="mean"),
67
- DateListDiff(aggregation="nunique"),
68
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
69
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
70
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
71
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
72
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
73
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
74
- DatePercentile(),
75
- DatePercentileMethod2(),
76
- Norm(),
77
- JaroWinklerSim1(),
78
- JaroWinklerSim2(),
79
- LevenshteinSim(),
80
- Distance(),
81
- Embeddings(),
82
- ]
83
- }
84
-
85
-
86
- def find_op(name):
87
- return deepcopy(ALL_OPERANDS.get(name))
@@ -1,24 +0,0 @@
1
- from typing import List, Optional
2
-
3
- import pandas as pd
4
-
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
-
7
-
8
- class Mean(PandasOperand, VectorizableMixin):
9
- name: str = "mean"
10
- output_type: Optional[str] = "float"
11
- is_vector: bool = True
12
- group_index: int = 0
13
-
14
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
- return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
-
17
-
18
- class Sum(PandasOperand, VectorizableMixin):
19
- name: str = "sum"
20
- is_vector: bool = True
21
- group_index: int = 0
22
-
23
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
- return pd.DataFrame(data).T.fillna(0).sum(axis=1)
File without changes
File without changes
File without changes