upgini 1.2.113a3974.dev1__py3-none-any.whl → 1.2.114__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.113a3974.dev1"
+__version__ = "1.2.114"
upgini/autofe/date.py CHANGED
@@ -244,7 +244,8 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
 class DatePercentileBase(PandasOperator, abc.ABC):
     is_binary: bool = True
-    output_type: Optional[str] = "float"
+    is_categorical: bool = True
+    output_type: Optional[str] = "category"
 
     date_unit: Optional[str] = None
 
@@ -254,7 +255,12 @@ class DatePercentileBase(PandasOperator, abc.ABC):
 
         bounds = self._get_bounds(left)
 
-        return right.index.to_series().apply(lambda i: self._perc(right[i], bounds[i]))
+        return (
+            right.index.to_series()
+            .apply(lambda i: self._perc(right[i], bounds[i]))
+            .astype(pd.Int64Dtype())
+            .astype("category")
+        )
 
     @abc.abstractmethod
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
@@ -318,8 +324,6 @@ class DatePercentile(DatePercentileBase):
 
 class DatePercentileMethod2(DatePercentileBase):
     name: str = "date_per_method2"
-    is_categorical: bool = True
-    output_type: Optional[str] = "category"
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         pass
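
The substantive change in DatePercentileBase is the output type: the computed percentile is now routed through pandas' nullable Int64 dtype before becoming a category, so rows where _perc yields None survive as missing values instead of breaking a plain integer cast. A minimal standalone sketch of that casting pattern (the perc helper below is a toy stand-in, not the upgini _perc):

    import pandas as pd

    # Toy stand-in for _perc: an integer bucket, or None for missing input.
    def perc(value):
        return None if pd.isna(value) else int(value * 10)

    values = pd.Series([0.31, None, 0.78])
    result = (
        values.apply(perc)
        .astype(pd.Int64Dtype())  # nullable ints: None becomes <NA> instead of raising
        .astype("category")
    )
    print(result.dtype)    # category
    print(result.tolist())  # [3, <NA>, 7]

The is_categorical / output_type flags that mark the result categorical were previously declared only on DatePercentileMethod2; they now live on the shared base class, which is why the same two lines are removed from the subclass.
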
upgini/dataset.py CHANGED
@@ -25,7 +25,6 @@ from upgini.metadata import (
     AutoFEParameters,
     CVType,
     DataType,
-    FeaturesFilter,
     FileColumnMeaningType,
     FileColumnMetadata,
     FileMetadata,
@@ -37,8 +36,9 @@ from upgini.metadata import (
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.search_task import SearchTask
+from upgini.utils.config import SampleConfig
 from upgini.utils.email_utils import EmailSearchKeyConverter
-from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample
+from upgini.utils.sample_utils import SampleColumns, sample
 
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -50,10 +50,7 @@ except Exception:
 
 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS = 100_000
-    IMBALANCE_THESHOLD = 0.6
-    MIN_TARGET_CLASS_ROWS = 100
-    MAX_MULTICLASS_CLASS_COUNT = 100
+    MAX_ROWS = 200_000
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
     MAX_FEATURES_COUNT = 3500
     MAX_UPLOADING_FILE_SIZE = 268435456  # 256 Mb
@@ -73,6 +70,7 @@ class Dataset:
         cv_type: Optional[CVType] = None,
         date_column: Optional[str] = None,
         id_columns: Optional[List[str]] = None,
+        is_imbalanced: bool = False,
         random_state: Optional[int] = None,
         sample_config: Optional[SampleConfig] = None,
         rest_client: Optional[_RestClient] = None,
@@ -117,8 +115,9 @@ class Dataset:
         self.rest_client = rest_client
         self.random_state = random_state
         self.columns_renaming: Dict[str, str] = {}
-        self.imbalanced: bool = False
+        self.is_imbalanced: bool = False
         self.id_columns = id_columns
+        self.is_imbalanced = is_imbalanced
         self.date_column = date_column
         if logger is not None:
             self.logger = logger
@@ -184,7 +183,19 @@ class Dataset:
     def __validate_target(self):
         # self.logger.info("Validating target")
         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-        target = self.data[target_column]
+
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target_column].isna().all():
+                    oot_indices.append(eval_set_index)
+
+        df_to_check = self.data.copy()
+        if oot_indices:
+            df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+        target = df_to_check[target_column]
 
         if self.task_type == ModelTaskType.BINARY:
             if not is_integer_dtype(target):
@@ -201,7 +212,7 @@ class Dataset:
         elif self.task_type == ModelTaskType.MULTICLASS:
             if not is_integer_dtype(target):
                 try:
-                    target = self.data[target_column].astype("category").cat.codes
+                    target = target.astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
                     raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
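
Target validation now recognizes out-of-time (OOT) eval sets: any eval set whose target column is entirely missing is excluded before the target checks run. A runnable sketch of the detection on toy data; the string value assigned to the EVAL_SET_INDEX constant here is an assumption made for the example (the real code imports it):

    import pandas as pd

    EVAL_SET_INDEX = "eval_set_index"  # assumed value, for illustration only

    df = pd.DataFrame({
        EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
        "target": [1, 0, 1, 0, None, None],  # eval set 2 is unlabeled
    })

    # An eval set is OOT when every target value in it is missing.
    oot_indices = [
        idx
        for idx in df[EVAL_SET_INDEX].unique()
        if df.loc[df[EVAL_SET_INDEX] == idx, "target"].isna().all()
    ]
    print(oot_indices)  # [2]

    # Target validation then runs only on the labeled rows.
    df_to_check = df[~df[EVAL_SET_INDEX].isin(oot_indices)]
    print(len(df_to_check))  # 4
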
@@ -227,8 +238,6 @@ class Dataset:
         else:
             train_segment = self.data
 
-        self.imbalanced = self.__is_imbalanced(train_segment)
-
         sample_columns = SampleColumns(
             ids=self.id_columns,
             date=self.date_column,
@@ -237,55 +246,19 @@ class Dataset:
         )
 
         self.data = sample(
-            train_segment if self.imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
+            train_segment if self.is_imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
             self.task_type,
             self.cv_type,
             self.sample_config,
             sample_columns,
             self.random_state,
-            balance=self.imbalanced,
+            balance=self.is_imbalanced,
             force_downsampling=force_downsampling,
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.warning_callback,
         )
 
-    def __is_imbalanced(self, data: pd.DataFrame) -> bool:
-        if self.task_type is None or not self.task_type.is_classification():
-            return False
-
-        if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
-            return False
-
-        count = len(data)
-        target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-        target = data[target_column]
-        target_classes_count = target.nunique()
-
-        if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-            msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
-                target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
-            )
-            self.logger.warning(msg)
-            raise ValidationError(msg)
-
-        vc = target.value_counts()
-        min_class_value = vc.index[len(vc) - 1]
-        min_class_count = vc[min_class_value]
-
-        if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-            msg = self.bundle.get("dataset_rarest_class_less_min").format(
-                min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
-            )
-            self.logger.warning(msg)
-            raise ValidationError(msg)
-
-        min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
-        min_class_threshold = min_class_percent * count
-
-        # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
-        return bool(min_class_count < min_class_threshold)
-
     def __validate_dataset(self, validate_target: bool, silent_mode: bool):
         """Validate DataSet"""
         # self.logger.info("validating etalon")
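
With __is_imbalanced deleted (along with the IMBALANCE_THESHOLD, MIN_TARGET_CLASS_ROWS, and MAX_MULTICLASS_CLASS_COUNT constants), imbalance is no longer detected inside Dataset; callers pass is_imbalanced to the constructor instead. For reference, the removed method applied a simple rule: the data counts as imbalanced when the rarest class holds less than IMBALANCE_THESHOLD / n_classes of the rows, i.e. under 30% for binary and under (60 / n_classes)% for multiclass. A standalone restatement of just that rule, with the validation-error branches omitted; this is an illustration, not the upgini API:

    # The removed constant, with its original spelling preserved.
    IMBALANCE_THESHOLD = 0.6

    def is_imbalanced(class_counts: dict) -> bool:
        total = sum(class_counts.values())
        n_classes = len(class_counts)
        min_class_count = min(class_counts.values())
        # Rarest class below 30% for binary, (60 / n_classes)% for multiclass.
        return min_class_count < (IMBALANCE_THESHOLD / n_classes) * total

    print(is_imbalanced({0: 900, 1: 100}))          # True:  10% < 30%
    print(is_imbalanced({0: 550, 1: 450}))          # False: 45% >= 30%
    print(is_imbalanced({0: 500, 1: 400, 2: 100}))  # True:  10% < 20%
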
@@ -335,15 +308,37 @@ class Dataset:
         all_valid_message = self.bundle.get("validation_all_valid_message")
         invalid_message = self.bundle.get("validation_invalid_message")
 
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target].isna().all():
+                    oot_indices.append(eval_set_index)
+
         for col in columns_to_validate:
-            self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
+                if oot_indices:
+                    mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                    invalid_target_mask = (
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+                    # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = True
+                    self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                else:
+                    # No OOT: mark invalid where target is NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = ~(
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+            else:
+                self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
 
-            invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
+            # Use stable pandas API across versions: Series.unique keeps order
+            # and collapses multiple NaNs into a single NaN
+            invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
             valid_share = self.data[f"{col}_is_valid"].sum() / nrows
             original_col_name = self.columns_renaming[col]
             validation_stats[original_col_name] = {}
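
Two behavioral points in this hunk are worth unpacking. First, target validity is now a vectorized mask that flags NaN, np.inf, and -np.inf (the old pair of statements caught NaN and +inf but not -inf), while rows in OOT eval sets stay valid. Second, invalid_values now deduplicates across all invalid rows rather than over just the first five. A toy sketch of both, independent of the Dataset class:

    import numpy as np
    import pandas as pd

    # Validity mask: NaN and +/-inf are invalid, but OOT rows stay valid.
    target = pd.Series([1.0, np.inf, -np.inf, None, 0.0])
    is_oot = pd.Series([False, False, False, True, False])  # row 3 sits in an OOT eval set

    invalid = target.isnull() | (target == np.inf) | (target == -np.inf)
    is_valid = pd.Series(True, index=target.index)
    is_valid[~is_oot & invalid] = False
    print(is_valid.tolist())  # [True, False, False, True, True]

    # Reporting distinct invalid values: unique() dedupes the whole Series in
    # first-seen order before taking five, unlike set(...head().values), which
    # only ever saw the first five invalid rows.
    invalid_col = pd.Series([np.nan] * 5 + [1.0, 2.0])
    print(list(set(invalid_col.head().values)))  # NaN only; 1.0 and 2.0 never seen
    print(invalid_col.unique().tolist()[:5])     # [nan, 1.0, 2.0]
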
@@ -503,9 +498,6 @@ class Dataset:
         return_scores: bool,
         extract_features: bool,
         accurate_model: Optional[bool] = None,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
-        filter_features: Optional[dict] = None,
         runtime_parameters: Optional[RuntimeParameters] = None,
         metrics_calculation: Optional[bool] = False,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
@@ -514,28 +506,12 @@ class Dataset:
         search_customization = SearchCustomization(
             extractFeatures=extract_features,
             accurateModel=accurate_model,
-            importanceThreshold=importance_threshold,
-            maxFeatures=max_features,
             returnScores=return_scores,
             runtimeParameters=runtime_parameters,
             metricsCalculation=metrics_calculation,
         )
-        if filter_features:
-            if [
-                key
-                for key in filter_features
-                if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
-            ]:
-                raise ValidationError(self.bundle.get("dataset_invalid_filter"))
-            feature_filter = FeaturesFilter(
-                minImportance=filter_features.get("min_importance"),
-                maxPSI=filter_features.get("max_psi"),
-                maxCount=filter_features.get("max_count"),
-                selectedFeatures=filter_features.get("selected_features"),
-            )
-            search_customization.featuresFilter = feature_filter
 
-        search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
+        search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
         if auto_fe_parameters is not None:
             search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
                 auto_fe_parameters.ts_gap_days
@@ -590,9 +566,6 @@ class Dataset:
         extract_features: bool = False,
         accurate_model: bool = False,
         exclude_features_sources: Optional[List[str]] = None,
-        importance_threshold: Optional[float] = None,  # deprecated
-        max_features: Optional[int] = None,  # deprecated
-        filter_features: Optional[dict] = None,  # deprecated
         runtime_parameters: Optional[RuntimeParameters] = None,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
         force_downsampling: bool = False,
@@ -609,9 +582,6 @@ class Dataset:
             return_scores=return_scores,
             extract_features=extract_features,
             accurate_model=accurate_model,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            filter_features=filter_features,
            runtime_parameters=runtime_parameters,
             auto_fe_parameters=auto_fe_parameters,
         )