upgini 1.2.32__py3-none-any.whl → 1.2.34a3657.dev1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.32"
1
+ __version__ = "1.2.34a3657.dev1"
@@ -1,87 +1,10 @@
1
- from copy import deepcopy
2
- from typing import Dict
3
-
4
- from upgini.autofe.binary import (
5
- Add,
6
- Combine,
7
- CombineThenFreq,
8
- Distance,
9
- Divide,
10
- JaroWinklerSim1,
11
- JaroWinklerSim2,
12
- LevenshteinSim,
13
- Max,
14
- Min,
15
- Multiply,
16
- Sim,
17
- Subtract,
18
- )
19
- from upgini.autofe.date import (
20
- DateDiff,
21
- DateDiffType2,
22
- DateListDiff,
23
- DateListDiffBounded,
24
- DatePercentile,
25
- DatePercentileMethod2,
26
- )
27
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
28
- from upgini.autofe.operand import Operand
29
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
30
- from upgini.autofe.vector import Mean, Sum
31
-
32
- ALL_OPERANDS: Dict[str, Operand] = {
33
- op.name: op
34
- for op in [
35
- Freq(),
36
- Mean(),
37
- Sum(),
38
- Abs(),
39
- Log(),
40
- Sqrt(),
41
- Square(),
42
- Sigmoid(),
43
- Floor(),
44
- Residual(),
45
- Min(),
46
- Max(),
47
- Add(),
48
- Subtract(),
49
- Multiply(),
50
- Divide(),
51
- GroupByThenAgg(name="GroupByThenMin", agg="min"),
52
- GroupByThenAgg(name="GroupByThenMax", agg="max"),
53
- GroupByThenAgg(name="GroupByThenMean", agg="mean"),
54
- GroupByThenAgg(name="GroupByThenMedian", agg="median"),
55
- GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
56
- GroupByThenRank(),
57
- Combine(),
58
- CombineThenFreq(),
59
- GroupByThenNUnique(),
60
- GroupByThenFreq(),
61
- Sim(),
62
- DateDiff(),
63
- DateDiffType2(),
64
- DateListDiff(aggregation="min"),
65
- DateListDiff(aggregation="max"),
66
- DateListDiff(aggregation="mean"),
67
- DateListDiff(aggregation="nunique"),
68
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
69
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
70
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
71
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
72
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
73
- DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
74
- DatePercentile(),
75
- DatePercentileMethod2(),
76
- Norm(),
77
- JaroWinklerSim1(),
78
- JaroWinklerSim2(),
79
- LevenshteinSim(),
80
- Distance(),
81
- Embeddings(),
82
- ]
83
- }
1
+ from upgini.autofe.operand import OperandRegistry
2
+ from upgini.autofe.unary import * # noqa
3
+ from upgini.autofe.binary import * # noqa
4
+ from upgini.autofe.groupby import * # noqa
5
+ from upgini.autofe.date import * # noqa
6
+ from upgini.autofe.vector import * # noqa
84
7
 
85
8
 
86
9
  def find_op(name):
87
- return deepcopy(ALL_OPERANDS.get(name))
10
+ return OperandRegistry.get_operand(name)
upgini/autofe/date.py CHANGED
@@ -7,11 +7,11 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand
10
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
14
- major_version = int(pydantic_version.split('.')[0])
14
+ major_version = int(pydantic_version.split(".")[0])
15
15
  return major_version
16
16
 
17
17
 
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
109
109
  _count_aggregations = ["nunique", "count"]
110
110
 
111
111
 
112
- class DateListDiff(PandasOperand, DateDiffMixin):
112
+ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
113
113
  is_binary: bool = True
114
114
  has_symmetry_importance: bool = True
115
115
 
@@ -129,10 +129,17 @@ class DateListDiff(PandasOperand, DateDiffMixin):
129
129
  )
130
130
  return res
131
131
 
132
- def __init__(self, **data: Any) -> None:
133
- if "name" not in data:
134
- data["name"] = f"date_diff_{data.get('aggregation')}"
135
- super().__init__(**data)
132
+ def to_formula(self) -> str:
133
+ return f"date_diff_{self.aggregation}"
134
+
135
+ @classmethod
136
+ def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
137
+ if not formula.startswith("date_diff_"):
138
+ return None
139
+ aggregation = formula.replace("date_diff_", "")
140
+ if "_" in aggregation:
141
+ return None
142
+ return cls(aggregation=aggregation)
136
143
 
137
144
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
138
145
  left = self._convert_to_date(left, self.left_unit)
@@ -170,23 +177,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
170
177
  return method(x) if len(x) > 0 else default
171
178
 
172
179
 
173
- class DateListDiffBounded(DateListDiff):
180
+ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
174
181
  lower_bound: Optional[int] = None
175
182
  upper_bound: Optional[int] = None
176
183
 
177
- def __init__(self, **data: Any) -> None:
178
- if "name" not in data:
179
- lower_bound = data.get("lower_bound")
180
- upper_bound = data.get("upper_bound")
181
- components = [
182
- "date_diff",
183
- data.get("diff_unit"),
184
- str(lower_bound if lower_bound is not None else "minusinf"),
185
- str(upper_bound if upper_bound is not None else "plusinf"),
186
- ]
187
- components.append(data.get("aggregation"))
188
- data["name"] = "_".join(components)
189
- super().__init__(**data)
184
+ def to_formula(self) -> str:
185
+ lower_bound = "minusinf" if self.lower_bound is None else self.lower_bound
186
+ upper_bound = "plusinf" if self.upper_bound is None else self.upper_bound
187
+ return f"date_diff_{self.diff_unit}_{lower_bound}_{upper_bound}_{self.aggregation}"
188
+
189
+ @classmethod
190
+ def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
191
+ import re
192
+
193
+ pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
194
+ match = re.match(pattern, formula)
195
+
196
+ if not match:
197
+ return None
198
+
199
+ diff_unit = match.group(1)
200
+ lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
201
+ upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
202
+ aggregation = match.group(6)
203
+
204
+ return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
190
205
 
191
206
  def _agg(self, x):
192
207
  x = x[
@@ -257,16 +272,17 @@ class DatePercentile(DatePercentileBase):
257
272
  # Use @field_validator for Pydantic 2.x
258
273
  from pydantic import field_validator
259
274
 
260
- @field_validator('zero_bounds', mode='before')
275
+ @field_validator("zero_bounds", mode="before")
261
276
  def parse_zero_bounds(cls, value):
262
277
  if isinstance(value, str):
263
278
  return json.loads(value)
264
279
  return value
280
+
265
281
  else:
266
282
  # Use @validator for Pydantic 1.x
267
283
  from pydantic import validator
268
284
 
269
- @validator('zero_bounds', pre=True)
285
+ @validator("zero_bounds", pre=True)
270
286
  def parse_zero_bounds(cls, value):
271
287
  if isinstance(value, str):
272
288
  return json.loads(value)
upgini/autofe/feature.py CHANGED
@@ -121,7 +121,7 @@ class Feature:
121
121
 
122
122
  def get_hash(self) -> str:
123
123
  return hashlib.sha256(
124
- "_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
124
+ "_".join([self.op.to_formula()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
125
125
  ).hexdigest()[:8]
126
126
 
127
127
  def set_alias(self, alias: str) -> "Feature":
@@ -129,7 +129,7 @@ class Feature:
129
129
  return self
130
130
 
131
131
  def get_all_operand_names(self) -> Set[str]:
132
- return {self.op.name}.union(
132
+ return {self.op.to_formula()}.union(
133
133
  {n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
134
134
  )
135
135
 
@@ -160,7 +160,7 @@ class Feature:
160
160
  child.delete_data()
161
161
 
162
162
  def get_op_display_name(self) -> str:
163
- return self.op.alias or self.op.name.lower()
163
+ return (self.op.alias or self.op.to_formula()).lower()
164
164
 
165
165
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
166
166
  if self.cached_display_name is not None and cache:
@@ -239,9 +239,9 @@ class Feature:
239
239
  if self.op.name in ["+", "-", "*", "/"]:
240
240
  left = self.children[0].to_formula(**kwargs)
241
241
  right = self.children[1].to_formula(**kwargs)
242
- return f"({left}{self.op.name}{right})"
242
+ return f"({left}{self.op.to_formula()}{right})"
243
243
  else:
244
- result = [self.op.name, "("]
244
+ result = [self.op.to_formula(), "("]
245
245
  for i in range(len(self.children)):
246
246
  string_i = self.children[i].to_formula(**kwargs)
247
247
  result.append(string_i)
@@ -254,9 +254,9 @@ class Feature:
254
254
  if self.op.name in ["+", "-", "*", "/"]:
255
255
  left = self.children[0].to_pretty_formula()
256
256
  right = self.children[1].to_pretty_formula()
257
- return f"{left} {self.op.name} {right}"
257
+ return f"{left} {self.op.to_formula()} {right}"
258
258
  else:
259
- result = [self.op.name, "("]
259
+ result = [self.op.to_formula(), "("]
260
260
  for i in range(len(self.children)):
261
261
  string_i = self.children[i].to_pretty_formula()
262
262
  result.append(string_i)
upgini/autofe/groupby.py CHANGED
@@ -2,33 +2,43 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
6
6
 
7
7
 
8
- class GroupByThenAgg(PandasOperand, VectorizableMixin):
8
+ class GroupByThenAgg(
9
+ PandasOperand,
10
+ VectorizableMixin,
11
+ ParametrizedOperand,
12
+ ):
9
13
  agg: Optional[str]
10
14
  is_vectorizable: bool = True
11
15
  is_grouping: bool = True
12
16
  is_distribution_dependent: bool = True
13
17
 
18
+ def to_formula(self) -> str:
19
+ return f"GroupByThen{self.agg}"
20
+
21
+ @classmethod
22
+ def from_formula(cls, formula: str) -> Optional["GroupByThenAgg"]:
23
+ if not formula.startswith("GroupByThen"):
24
+ return None
25
+ agg = formula[len("GroupByThen") :]
26
+ if agg.lower() in ["rank", "nunique", "freq"]: # other implementation
27
+ return None
28
+ return cls(agg=agg)
29
+
14
30
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
15
- temp = left.groupby(right).agg(self.agg)
31
+ temp = left.groupby(right).agg(self.agg.lower())
16
32
  return self._loc(right, temp)
17
33
 
18
34
  def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
19
35
  group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
20
36
  d1 = data[value_columns]
21
37
  d2 = data[group_column]
22
- temp = d1.groupby(d2).agg(self.agg)
38
+ temp = d1.groupby(d2).agg(self.agg.lower())
23
39
  return temp.merge(d2, how="right", on=[group_column])[value_columns]
24
40
 
25
41
 
26
- class GroupByThenMedian(GroupByThenAgg):
27
- name: str = "GroupByThenMedian"
28
- pandas_agg: str = "median"
29
- is_distribution_dependent: bool = True
30
-
31
-
32
42
  class GroupByThenRank(PandasOperand, VectorizableMixin):
33
43
  name: str = "GroupByThenRank"
34
44
  is_vectorizable: bool = True
upgini/autofe/operand.py CHANGED
@@ -6,8 +6,48 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class Operand(BaseModel):
10
- name: str
9
+ class OperandRegistry(type(BaseModel)):
10
+ _registry = {}
11
+ _parametrized_registry = []
12
+
13
+ def __new__(cls, name, bases, attrs):
14
+ new_class = super().__new__(cls, name, bases, attrs)
15
+ # Only register if it's a concrete class that inherits from Operand
16
+ base_classes = [b for b in bases]
17
+ base_names = {b.__name__ for b in bases}
18
+ while base_classes:
19
+ base = base_classes.pop()
20
+ base_names.update(b.__name__ for b in base.__bases__)
21
+ base_classes.extend(base.__bases__)
22
+
23
+ if "Operand" in base_names:
24
+ # Track parametrized operands separately
25
+ if "ParametrizedOperand" in base_names:
26
+ cls._parametrized_registry.append(new_class)
27
+ else:
28
+ try:
29
+ instance = new_class()
30
+ cls._registry[instance.name] = new_class
31
+ except Exception:
32
+ pass
33
+ return new_class
34
+
35
+ @classmethod
36
+ def get_operand(cls, name: str) -> Optional["Operand"]:
37
+ # First try to resolve as a parametrized operand formula
38
+ for operand_cls in cls._parametrized_registry:
39
+ resolved = operand_cls.from_formula(name)
40
+ if resolved is not None:
41
+ return resolved
42
+ # Fall back to direct registry lookup
43
+ non_parametrized = cls._registry.get(name)
44
+ if non_parametrized is not None:
45
+ return non_parametrized()
46
+ return None
47
+
48
+
49
+ class Operand(BaseModel, metaclass=OperandRegistry):
50
+ name: Optional[str] = None
11
51
  alias: Optional[str] = None
12
52
  is_unary: bool = False
13
53
  is_symmetrical: bool = False
@@ -31,6 +71,21 @@ class Operand(BaseModel):
31
71
  res.update(self.params or {})
32
72
  return res
33
73
 
74
+ def to_formula(self) -> str:
75
+ return self.name
76
+
77
+
78
+ class ParametrizedOperand(Operand, abc.ABC):
79
+
80
+ @abc.abstractmethod
81
+ def to_formula(self) -> str:
82
+ pass
83
+
84
+ @classmethod
85
+ @abc.abstractmethod
86
+ def from_formula(cls, formula: str) -> Optional["Operand"]:
87
+ pass
88
+
34
89
 
35
90
  MAIN_COLUMN = "main_column"
36
91
 
upgini/autofe/vector.py CHANGED
@@ -1,8 +1,10 @@
1
- from typing import List, Optional
1
+ import abc
2
+ from typing import Dict, List, Optional
2
3
 
3
4
  import pandas as pd
5
+ from pydantic import validator
4
6
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
6
8
 
7
9
 
8
10
  class Mean(PandasOperand, VectorizableMixin):
@@ -22,3 +24,119 @@ class Sum(PandasOperand, VectorizableMixin):
22
24
 
23
25
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
26
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
27
+
28
+
29
+ class TimeSeriesBase(PandasOperand, abc.ABC):
30
+ is_vector: bool = True
31
+ date_unit: Optional[str] = None
32
+
33
+ def get_params(self) -> Dict[str, Optional[str]]:
34
+ res = super().get_params()
35
+ res.update(
36
+ {
37
+ "date_unit": self.date_unit,
38
+ }
39
+ )
40
+ return res
41
+
42
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
43
+ # assuming first is date, last is value, rest is group columns
44
+ date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
45
+ ts = pd.concat([date] + data[1:], axis=1)
46
+ ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
47
+ ts.set_index(date.name, inplace=True)
48
+ ts = ts[ts.index.notna()].sort_index()
49
+ ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
50
+ ts = self._aggregate(ts)
51
+ ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
52
+
53
+ return ts.iloc[:, -1]
54
+
55
+ @abc.abstractmethod
56
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
57
+ pass
58
+
59
+
60
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
61
+
62
+
63
+ class Roll(TimeSeriesBase, ParametrizedOperand):
64
+ aggregation: str
65
+ window_size: int = 1
66
+ window_unit: str = "D"
67
+
68
+ @validator("window_unit")
69
+ def validate_window_unit(cls, v: str) -> str:
70
+ try:
71
+ pd.tseries.frequencies.to_offset(v)
72
+ return v
73
+ except ValueError:
74
+ raise ValueError(
75
+ f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
76
+ )
77
+
78
+ def to_formula(self) -> str:
79
+ return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
80
+
81
+ @classmethod
82
+ def from_formula(cls, formula: str) -> Optional["Roll"]:
83
+ import re
84
+
85
+ pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
86
+ match = re.match(pattern, formula)
87
+
88
+ if not match:
89
+ return None
90
+
91
+ window_size = int(match.group(1))
92
+ window_unit = match.group(2)
93
+ aggregation = match.group(3)
94
+
95
+ return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
96
+
97
+ def get_params(self) -> Dict[str, Optional[str]]:
98
+ res = super().get_params()
99
+ res.update(
100
+ {
101
+ "window_size": self.window_size,
102
+ "window_unit": self.window_unit,
103
+ "aggregation": self.aggregation,
104
+ }
105
+ )
106
+ return res
107
+
108
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
109
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
110
+ _roll_aggregations.get(self.aggregation, self.aggregation)
111
+ )
112
+
113
+
114
+ class Lag(TimeSeriesBase, ParametrizedOperand):
115
+ lag_size: int
116
+ lag_unit: str = "D"
117
+
118
+ def to_formula(self) -> str:
119
+ return f"lag_{self.lag_size}{self.lag_unit}"
120
+
121
+ @classmethod
122
+ def from_formula(cls, formula: str) -> Optional["Lag"]:
123
+ import re
124
+
125
+ pattern = r"^lag_(\d+)([a-zA-Z])$"
126
+ match = re.match(pattern, formula)
127
+
128
+ if not match:
129
+ return None
130
+
131
+ lag_size = int(match.group(1))
132
+ lag_unit = match.group(2)
133
+
134
+ return cls(lag_size=lag_size, lag_unit=lag_unit)
135
+
136
+ def get_params(self) -> Dict[str, Optional[str]]:
137
+ res = super().get_params()
138
+ return res
139
+
140
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
141
+ lag_window = self.lag_size + 1
142
+ return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
@@ -111,7 +111,11 @@ try:
111
111
  except Exception:
112
112
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
113
113
 
114
- from upgini.utils.target_utils import calculate_psi, define_task
114
+ from upgini.utils.target_utils import (
115
+ balance_undersample_forced,
116
+ calculate_psi,
117
+ define_task,
118
+ )
115
119
  from upgini.utils.warning_counter import WarningCounter
116
120
  from upgini.version_validator import validate_version
117
121
 
@@ -967,6 +971,13 @@ class FeaturesEnricher(TransformerMixin):
967
971
  self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
968
972
  return None
969
973
 
974
+ maybe_phone_column = self._get_phone_column(self.search_keys)
975
+ text_features = (
976
+ [f for f in self.generate_features if f != maybe_phone_column]
977
+ if self.generate_features is not None
978
+ else None
979
+ )
980
+
970
981
  print(self.bundle.get("metrics_start"))
971
982
  with Spinner():
972
983
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
@@ -982,7 +993,7 @@ class FeaturesEnricher(TransformerMixin):
982
993
  fitting_enriched_X,
983
994
  scoring,
984
995
  groups=groups,
985
- text_features=self.generate_features,
996
+ text_features=text_features,
986
997
  has_date=has_date,
987
998
  )
988
999
  metric = wrapper.metric_name
@@ -1009,7 +1020,7 @@ class FeaturesEnricher(TransformerMixin):
1009
1020
  cat_features,
1010
1021
  add_params=custom_loss_add_params,
1011
1022
  groups=groups,
1012
- text_features=self.generate_features,
1023
+ text_features=text_features,
1013
1024
  has_date=has_date,
1014
1025
  )
1015
1026
  etalon_cv_result = baseline_estimator.cross_val_predict(
@@ -1044,7 +1055,7 @@ class FeaturesEnricher(TransformerMixin):
1044
1055
  cat_features,
1045
1056
  add_params=custom_loss_add_params,
1046
1057
  groups=groups,
1047
- text_features=self.generate_features,
1058
+ text_features=text_features,
1048
1059
  has_date=has_date,
1049
1060
  )
1050
1061
  enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
@@ -1827,7 +1838,27 @@ class FeaturesEnricher(TransformerMixin):
1827
1838
 
1828
1839
  # downsample if need to eval_set threshold
1829
1840
  num_samples = _num_samples(df)
1830
- if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1841
+ phone_column = self._get_phone_column(self.search_keys)
1842
+ force_downsampling = (
1843
+ not self.disable_force_downsampling
1844
+ and self.generate_features is not None
1845
+ and phone_column is not None
1846
+ and self.fit_columns_renaming[phone_column] in self.generate_features
1847
+ and num_samples > Dataset.FORCE_SAMPLE_SIZE
1848
+ )
1849
+ if force_downsampling:
1850
+ self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1851
+ df = balance_undersample_forced(
1852
+ df=df,
1853
+ target_column=TARGET,
1854
+ task_type=self.model_task_type,
1855
+ random_state=self.random_state,
1856
+ sample_size=Dataset.FORCE_SAMPLE_SIZE,
1857
+ logger=self.logger,
1858
+ bundle=self.bundle,
1859
+ warning_callback=self.__log_warning,
1860
+ )
1861
+ elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1831
1862
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1832
1863
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1833
1864
 
@@ -2063,6 +2094,15 @@ class FeaturesEnricher(TransformerMixin):
2063
2094
  self.__display_support_link(msg)
2064
2095
  return None, {c: c for c in X.columns}, []
2065
2096
 
2097
+ features_meta = self._search_task.get_all_features_metadata_v2()
2098
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
2099
+ if len(online_api_features) > 0:
2100
+ self.logger.warning(
2101
+ f"There are important features for transform, that generated by online API: {online_api_features}"
2102
+ )
2103
+ # TODO
2104
+ raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2105
+
2066
2106
  if not metrics_calculation:
2067
2107
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
2068
2108
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
@@ -2708,8 +2748,9 @@ class FeaturesEnricher(TransformerMixin):
2708
2748
  and self.generate_features is not None
2709
2749
  and phone_column is not None
2710
2750
  and self.fit_columns_renaming[phone_column] in self.generate_features
2751
+ and len(df) > Dataset.FORCE_SAMPLE_SIZE
2711
2752
  )
2712
- if force_downsampling and len(df) > Dataset.FORCE_SAMPLE_SIZE:
2753
+ if force_downsampling:
2713
2754
  runtime_parameters.properties["fast_fit"] = True
2714
2755
 
2715
2756
  dataset = Dataset(
upgini/metadata.py CHANGED
@@ -255,6 +255,7 @@ class FeaturesMetadataV2(BaseModel):
255
255
  data_source_links: Optional[List[str]] = None
256
256
  doc_link: Optional[str] = None
257
257
  update_frequency: Optional[str] = None
258
+ from_online_api: Optional[bool] = None
258
259
 
259
260
 
260
261
  class HitRateMetrics(BaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.32
3
+ Version: 1.2.34a3657.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -110,7 +110,7 @@ Description-Content-Type: text/markdown
110
110
  </tr>
111
111
  </table>
112
112
 
113
- ⭐️ [Simple Drag & Drop Search UI](https://upgini.com/upgini-widget):
113
+ ⭐️ [Simple Drag & Drop Search UI](https://www.upgini.com/data-search-widget):
114
114
  <a href="https://upgini.com/upgini-widget">
115
115
  <img width="710" alt="Drag & Drop Search UI" src="https://github.com/upgini/upgini/assets/95645411/36b6460c-51f3-400e-9f04-445b938bf45e">
116
116
  </a>
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=UIgtmuCowO7KL6tTNZJcBfDcGQ4kWH7MjulZqCj9os8,23
1
+ upgini/__about__.py,sha256=JBP_tvOiBuuOyLx7mNqZYU1UEW5bf82plZzE0AvVsfI,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=KnkqV7Nnx3kxfQ89giDao3bmCm4MFJWqJUrONy85E-k,32030
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=ZUbq6ZsIYznFSyU7tTmCPNzEyr-yRI0r-FJcz4i_Ads,192899
6
+ upgini/features_enricher.py,sha256=q11aMFPlCJy1m4sOFfGZFfb4vdG3-hdd0wgm2BXgs9A,194748
7
7
  upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
9
+ upgini/metadata.py,sha256=ACzIQQwCHCFHlUqXqKpxd3IQ4bBAaVvy8UaCGTqLGQs,11278
10
10
  upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
17
+ upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
18
  upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
- upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
- upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
- upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
19
+ upgini/autofe/date.py,sha256=Sd1Bm_uby9liSgsUkxsFgnCFaHxmj9MLX0ymR9DLQuQ,10401
20
+ upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
21
+ upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
+ upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
24
+ upgini/autofe/vector.py,sha256=MyNPuqZ5J2vqRSn2UQcKp0ekXWv-d6lImEwqfU3pbCM,4328
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=Ed5IXkPjV9AfAZQAwCYksAmKaPGQliplvDYS_yeWdfk,11330
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.32.dist-info/METADATA,sha256=GTTmeHuetD3Mrl8pR9K3YFzJcPE8Zl8UdWb23vG-R_s,48578
63
- upgini-1.2.32.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.32.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.32.dist-info/RECORD,,
62
+ upgini-1.2.34a3657.dev1.dist-info/METADATA,sha256=marFhP2NoGmDk3lYZemMPRXcBRCB6jr_3tgx-I7fhIE,48597
63
+ upgini-1.2.34a3657.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.34a3657.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.34a3657.dev1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any