upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.113a3974.dev2"
+__version__ = "1.2.114a2"
upgini/dataset.py CHANGED
@@ -50,7 +50,7 @@ except Exception:

 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS = 100_000
+    MAX_ROWS = 200_000
     IMBALANCE_THESHOLD = 0.6
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
@@ -184,7 +184,19 @@ class Dataset:
     def __validate_target(self):
         # self.logger.info("Validating target")
         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-        target = self.data[target_column]
+
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target_column].isna().all():
+                    oot_indices.append(eval_set_index)
+
+        df_to_check = self.data.copy()
+        if oot_indices:
+            df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+        target = df_to_check[target_column]

         if self.task_type == ModelTaskType.BINARY:
             if not is_integer_dtype(target):
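
A minimal standalone sketch of the out-of-time (OOT) detection pattern added above, assuming an eval_set_index column as in the diff; the column names and toy data below are illustrative only, not part of the package:

import numpy as np
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumed constant name, for illustration only

df = pd.DataFrame({
    "target": [0, 1, 0, np.nan, np.nan, 1],
    EVAL_SET_INDEX: [0, 0, 0, 1, 1, 2],
})

# Eval-set segments whose target is entirely NaN are treated as out-of-time (OOT)
oot_indices = [
    idx for idx in df[EVAL_SET_INDEX].unique()
    if df.loc[df[EVAL_SET_INDEX] == idx, "target"].isna().all()
]

# Target validation then runs only on the non-OOT rows
df_to_check = df[~df[EVAL_SET_INDEX].isin(oot_indices)] if oot_indices else df
# oot_indices -> [1]; df_to_check keeps only the rows of segments 0 and 2
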
@@ -201,7 +213,7 @@ class Dataset:
         elif self.task_type == ModelTaskType.MULTICLASS:
             if not is_integer_dtype(target):
                 try:
-                    target = self.data[target_column].astype("category").cat.codes
+                    target = target.astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
                     raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
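
The cast above now reuses the already-filtered target Series instead of re-reading the raw column from self.data. The pandas idiom itself, shown in isolation with made-up labels:

import pandas as pd

target = pd.Series(["cat", "dog", "cat", "bird"])
codes = target.astype("category").cat.codes  # one integer code per category
# codes.tolist() -> [1, 2, 1, 0]; categories are sorted, so bird=0, cat=1, dog=2
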
@@ -335,15 +347,37 @@ class Dataset:
         all_valid_message = self.bundle.get("validation_all_valid_message")
         invalid_message = self.bundle.get("validation_invalid_message")

+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target].isna().all():
+                    oot_indices.append(eval_set_index)
+
         for col in columns_to_validate:
-            self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
+                if oot_indices:
+                    mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                    invalid_target_mask = (
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+                    # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = True
+                    self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                else:
+                    # No OOT: mark invalid where target is NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = ~(
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+            else:
+                self.data[f"{col}_is_valid"] = ~self.data[col].isnull()

             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]

-            invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
+            # Use stable pandas API across versions: Series.unique keeps order
+            # and collapses multiple NaNs into a single NaN
+            invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
             valid_share = self.data[f"{col}_is_valid"].sum() / nrows
             original_col_name = self.columns_renaming[col]
             validation_stats[original_col_name] = {}
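
A condensed sketch of the new target-validity logic outside the Dataset class, assuming the same eval_set_index convention as above; the names and data are illustrative:

import numpy as np
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumed constant name

df = pd.DataFrame({
    "target": [0.0, np.inf, np.nan, np.nan],
    EVAL_SET_INDEX: [0, 0, 0, 1],  # segment 1 is OOT: its target is all NaN
})
oot_indices = [1]

invalid_target = df["target"].isnull() | (df["target"] == np.inf) | (df["target"] == -np.inf)
mask_not_oot = ~df[EVAL_SET_INDEX].isin(oot_indices)

# Start fully valid, then flag only non-OOT rows whose target is NaN or +/-inf
df["target_is_valid"] = True
df.loc[mask_not_oot & invalid_target, "target_is_valid"] = False
# -> [True, False, False, True]: the OOT NaN in segment 1 stays valid

# First distinct invalid values; Series.unique keeps order and collapses NaNs
invalid_values = df.loc[~df["target_is_valid"], "target"].unique().tolist()[:5]
# -> [inf, nan]
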