upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  from typing import Dict
2
- from upgini.autofe.date import DateDiff, DateDiffFuture
2
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
3
3
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
4
  from upgini.autofe.operand import Operand
5
5
  from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -37,7 +37,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
37
37
  Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
38
38
  Sim(),
39
39
  DateDiff(),
40
- DateDiffFuture(),
40
+ DateDiffType2(),
41
+ DateListDiff(aggregation="min"),
42
+ DateListDiff(aggregation="max"),
43
+ DateListDiff(aggregation="mean"),
44
+ DateListDiff(aggregation="nunique"),
45
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
46
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
47
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
48
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
49
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
50
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
41
51
  ]
42
52
  }
43
53
 
upgini/autofe/date.py CHANGED
@@ -1,11 +1,12 @@
1
- from typing import Optional, Union
1
+ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
+ from pydantic import BaseModel
4
5
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
+ from upgini.autofe.operand import PandasOperand
6
7
 
7
8
 
8
- class DateDiffMixin:
9
+ class DateDiffMixin(BaseModel):
9
10
  diff_unit: str = "D"
10
11
  left_unit: Optional[str] = None
11
12
  right_unit: Optional[str] = None
@@ -34,18 +35,77 @@ class DateDiff(PandasOperand, DateDiffMixin):
34
35
  return x
35
36
 
36
37
 
37
- class DateDiffFuture(PandasOperand, DateDiffMixin):
38
- name = "date_diff_future"
38
+ class DateDiffType2(PandasOperand, DateDiffMixin):
39
+ name = "date_diff_type2"
39
40
  is_binary = True
40
41
  has_symmetry_importance = True
41
- is_vectorizable = False
42
42
 
43
43
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
44
  left = self._convert_to_date(left, self.left_unit)
45
45
  right = self._convert_to_date(right, self.right_unit)
46
- future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
46
+ future = right + (left.dt.year - right.dt.year).apply(
47
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
+ )
49
+ future = pd.to_datetime(future)
47
50
  before = future[future < left]
48
- future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
51
+ future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
49
52
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
50
53
 
51
54
  return diff
55
+
56
+
57
+ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
58
+
59
+
60
+ class DateListDiff(PandasOperand, DateDiffMixin):
61
+ is_binary = True
62
+ has_symmetry_importance = True
63
+ aggregation: str
64
+
65
+ def __init__(self, **data: Any) -> None:
66
+ if "name" not in data:
67
+ data["name"] = f"date_diff_{data.get('aggregation')}"
68
+ super().__init__(**data)
69
+
70
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
71
+ left = self._convert_to_date(left, self.left_unit)
72
+ right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
73
+
74
+ return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
75
+
76
+ def _diff(self, x):
77
+ x = x / np.timedelta64(1, self.diff_unit)
78
+ return x[x > 0]
79
+
80
+ def _agg(self, x):
81
+ method = getattr(np, self.aggregation, None)
82
+ default = np.nan
83
+ if method is None and self.aggregation in _ext_aggregations:
84
+ method, default = _ext_aggregations[self.aggregation]
85
+ elif not callable(method):
86
+ raise ValueError(f"Unsupported aggregation: {self.aggregation}")
87
+
88
+ return method(x) if len(x) > 0 else default
89
+
90
+
91
+ class DateListDiffBounded(DateListDiff):
92
+ lower_bound: Optional[int]
93
+ upper_bound: Optional[int]
94
+
95
+ def __init__(self, **data: Any) -> None:
96
+ if "name" not in data:
97
+ lower_bound = data.get("lower_bound")
98
+ upper_bound = data.get("upper_bound")
99
+ components = [
100
+ "date_diff",
101
+ data.get("diff_unit"),
102
+ str(lower_bound if lower_bound is not None else "minusinf"),
103
+ str(upper_bound if upper_bound is not None else "plusinf"),
104
+ ]
105
+ components.append(data.get("aggregation"))
106
+ data["name"] = "_".join(components)
107
+ super().__init__(**data)
108
+
109
+ def _agg(self, x):
110
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
111
+ return super()._agg(x)
upgini/autofe/feature.py CHANGED
@@ -305,7 +305,7 @@ class FeatureGroup:
305
305
  grouped_features = []
306
306
 
307
307
  def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
308
- return (f.op, f.children[f.op.group_index])
308
+ return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
309
309
 
310
310
  for op_child, features in itertools.groupby(candidates, groupby_func):
311
311
  op, main_child = op_child
@@ -40,7 +40,7 @@ class DataSourcePublisher:
40
40
  if logs_enabled:
41
41
  self.logger = LoggerFactory().get_logger(endpoint, api_key)
42
42
  else:
43
- self.logger = logging.getLogger()
43
+ self.logger = logging.getLogger("muted_logger")
44
44
  self.logger.setLevel("FATAL")
45
45
 
46
46
  def place(
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -72,8 +72,8 @@ class DataSourcePublisher:
72
72
  )
73
73
  if search_keys is None or len(search_keys) == 0:
74
74
  raise ValidationError("Empty search keys")
75
- if SearchKey.DATE in search_keys.values() and date_format is None:
76
- raise ValidationError("date_format is required for DATE search key")
75
+ # if SearchKey.DATE in search_keys.values() and date_format is None:
76
+ # raise ValidationError("date_format is required for DATE search key")
77
77
  if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
78
78
  raise ValidationError(
79
79
  f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
85
85
  or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
86
86
  ) and sort_column is None:
87
87
  raise ValidationError("Sort column is required for passed search keys")
88
+ if (
89
+ set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
90
+ and snapshot_frequency_days is None
91
+ and join_date_abs_limit_days is None
92
+ ):
93
+ raise ValidationError(
94
+ "With MSISDN and DATE keys one of the snapshot_frequency_days or"
95
+ " join_date_abs_limit_days parameters is required"
96
+ )
88
97
 
89
98
  request = {
90
99
  "dataTableUri": data_table_uri,
91
100
  "searchKeys": {k: v.value.value for k, v in search_keys.items()},
92
- "dateFormat": date_format,
93
101
  "excludeColumns": exclude_columns,
94
102
  "hashFeatureNames": str(hash_feature_names).lower(),
95
103
  "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
98
106
  "featuresForEmbeddings": features_for_embeddings,
99
107
  "forceGeneration": str(_force_generation).lower(),
100
108
  }
109
+ if date_format is not None:
110
+ request["dateFormat"] = date_format
101
111
  if secondary_search_keys is not None:
102
112
  request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
103
113
  if sort_column is not None:
@@ -170,6 +180,7 @@ class DataSourcePublisher:
170
180
  print(msg)
171
181
  self.logger.info(msg)
172
182
  self._rest_client.stop_ads_management_task(task_id, trace_id)
183
+ raise
173
184
  except Exception:
174
185
  self.logger.exception("Failed to register data table")
175
186
  raise
@@ -289,6 +300,7 @@ class DataSourcePublisher:
289
300
  raise ValidationError("One of arguments: bq_table_id or search_keys should be presented")
290
301
  if bq_table_id is not None and search_keys is not None:
291
302
  raise ValidationError("Only one argument could be presented: bq_table_id or search_keys")
303
+ task_id = None
292
304
  try:
293
305
  search_keys = [k.value.value for k in search_keys] if search_keys else None
294
306
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
@@ -303,6 +315,13 @@ class DataSourcePublisher:
303
315
  raise Exception("Failed to register ADS: " + status_response["errorMessage"])
304
316
 
305
317
  print("Uploading successfully finished")
318
+ except KeyboardInterrupt:
319
+ if task_id is not None:
320
+ msg = f"Stopping AdsManagementTask {task_id}"
321
+ print(msg)
322
+ self.logger.info(msg)
323
+ self._rest_client.stop_ads_management_task(task_id, trace_id)
324
+ raise
306
325
  except Exception:
307
326
  self.logger.exception(f"Failed to upload table {bq_table_id}")
308
327
  raise
upgini/dataset.py CHANGED
@@ -39,10 +39,10 @@ from upgini.metadata import (
39
39
  )
40
40
  from upgini.normalizer.phone_normalizer import PhoneNormalizer
41
41
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
42
- from upgini.sampler.random_under_sampler import RandomUnderSampler
43
42
  from upgini.search_task import SearchTask
44
43
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
45
44
  from upgini.utils.email_utils import EmailSearchKeyConverter
45
+ from upgini.utils.target_utils import balance_undersample
46
46
 
47
47
  try:
48
48
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -60,7 +60,9 @@ class Dataset: # (pd.DataFrame):
60
60
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
61
61
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
62
62
  MIN_SAMPLE_THRESHOLD = 5_000
63
- IMBALANCE_THESHOLD = 0.4
63
+ IMBALANCE_THESHOLD = 0.6
64
+ BINARY_BOOTSTRAP_LOOPS = 5
65
+ MULTICLASS_BOOTSTRAP_LOOPS = 2
64
66
  MIN_TARGET_CLASS_ROWS = 100
65
67
  MAX_MULTICLASS_CLASS_COUNT = 100
66
68
  MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
@@ -460,10 +462,8 @@ class Dataset: # (pd.DataFrame):
460
462
  self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
461
463
  ):
462
464
  count = len(train_segment)
463
- min_class_count = count
464
- min_class_value = None
465
- target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
466
- target = train_segment[target_column].copy()
465
+ target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
466
+ target = train_segment[target_column]
467
467
  target_classes_count = target.nunique()
468
468
 
469
469
  if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -473,12 +473,9 @@ class Dataset: # (pd.DataFrame):
473
473
  self.logger.warning(msg)
474
474
  raise ValidationError(msg)
475
475
 
476
- unique_target = target.unique()
477
- for v in list(unique_target): # type: ignore
478
- current_class_count = len(train_segment.loc[target == v])
479
- if current_class_count < min_class_count:
480
- min_class_count = current_class_count
481
- min_class_value = v
476
+ vc = target.value_counts()
477
+ min_class_value = vc.index[len(vc) - 1]
478
+ min_class_count = vc[min_class_value]
482
479
 
483
480
  if min_class_count < self.MIN_TARGET_CLASS_ROWS:
484
481
  msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -491,53 +488,19 @@ class Dataset: # (pd.DataFrame):
491
488
  min_class_threshold = min_class_percent * count
492
489
 
493
490
  if min_class_count < min_class_threshold:
494
- msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
495
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
496
- )
497
- self.logger.warning(msg)
498
- print(msg)
499
- self.warning_counter.increment()
500
-
501
- train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
502
- if self.task_type == ModelTaskType.MULTICLASS:
503
- # Sort classes by rows count and find 25% quantile class
504
- classes = target.value_counts().index
505
- quantile25_idx = int(0.75 * len(classes))
506
- quantile25_class = classes[quantile25_idx]
507
- count_of_quantile25_class = len(target[target == quantile25_class])
508
- msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
509
- self.logger.warning(msg)
510
- print(msg)
511
- # 25% and lower classes will stay as is. Higher classes will be downsampled
512
- parts = []
513
- for class_idx in range(quantile25_idx):
514
- sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
515
- n=count_of_quantile25_class, random_state=self.random_state
516
- )
517
- parts.append(sampled)
518
- for class_idx in range(quantile25_idx, len(classes)):
519
- parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
520
- resampled_data = pd.concat(parts)
521
- elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
522
- minority_class = train_segment[train_segment[target_column] == min_class_value]
523
- majority_class = train_segment[train_segment[target_column] != min_class_value]
524
- sampled_majority_class = majority_class.sample(
525
- n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
526
- )
527
- resampled_data = train_segment[
528
- (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
529
- | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
530
- ]
531
- else:
532
- sampler = RandomUnderSampler(random_state=self.random_state)
533
- X = train_segment[SYSTEM_RECORD_ID]
534
- X = X.to_frame(SYSTEM_RECORD_ID)
535
- new_x, _ = sampler.fit_resample(X, target) # type: ignore
536
- resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
537
-
538
- self.data = resampled_data
539
- self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
540
491
  self.imbalanced = True
492
+ self.data = balance_undersample(
493
+ df=train_segment,
494
+ target_column=target_column,
495
+ task_type=self.task_type,
496
+ random_state=self.random_state,
497
+ imbalance_threshold=self.IMBALANCE_THESHOLD,
498
+ binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
499
+ multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
500
+ logger=self.logger,
501
+ bundle=self.bundle,
502
+ warning_counter=self.warning_counter,
503
+ )
541
504
 
542
505
  # Resample over fit threshold
543
506
  if not self.imbalanced and EVAL_SET_INDEX in self.data.columns: