upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +40 -6
- upgini/features_enricher.py +489 -147
- upgini/metadata.py +1 -0
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +6 -1
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/feature_info.py +5 -0
- upgini/utils/psi.py +294 -0
- upgini/utils/sample_utils.py +30 -2
- upgini/utils/target_utils.py +6 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/METADATA +31 -17
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/RECORD +17 -16
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.113a3974.dev2"
|
1
|
+
__version__ = "1.2.114a2"
|
upgini/dataset.py
CHANGED
@@ -50,7 +50,7 @@ except Exception:
|
|
50
50
|
|
51
51
|
class Dataset:
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
|
-
MAX_ROWS =
|
53
|
+
MAX_ROWS = 200_000
|
54
54
|
IMBALANCE_THESHOLD = 0.6
|
55
55
|
MIN_TARGET_CLASS_ROWS = 100
|
56
56
|
MAX_MULTICLASS_CLASS_COUNT = 100
|
@@ -184,7 +184,19 @@ class Dataset:
|
|
184
184
|
def __validate_target(self):
|
185
185
|
# self.logger.info("Validating target")
|
186
186
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
|
187
|
-
|
187
|
+
|
188
|
+
oot_indices = []
|
189
|
+
if EVAL_SET_INDEX in self.data.columns:
|
190
|
+
for eval_set_index in self.data[EVAL_SET_INDEX].unique():
|
191
|
+
eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
|
192
|
+
if eval_set[target_column].isna().all():
|
193
|
+
oot_indices.append(eval_set_index)
|
194
|
+
|
195
|
+
df_to_check = self.data.copy()
|
196
|
+
if oot_indices:
|
197
|
+
df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
|
198
|
+
|
199
|
+
target = df_to_check[target_column]
|
188
200
|
|
189
201
|
if self.task_type == ModelTaskType.BINARY:
|
190
202
|
if not is_integer_dtype(target):
|
@@ -201,7 +213,7 @@ class Dataset:
|
|
201
213
|
elif self.task_type == ModelTaskType.MULTICLASS:
|
202
214
|
if not is_integer_dtype(target):
|
203
215
|
try:
|
204
|
-
target =
|
216
|
+
target = target.astype("category").cat.codes
|
205
217
|
except Exception:
|
206
218
|
self.logger.exception("Failed to cast target to category codes for multiclass task type")
|
207
219
|
raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
|
@@ -335,15 +347,37 @@ class Dataset:
|
|
335
347
|
all_valid_message = self.bundle.get("validation_all_valid_message")
|
336
348
|
invalid_message = self.bundle.get("validation_invalid_message")
|
337
349
|
|
350
|
+
oot_indices = []
|
351
|
+
if EVAL_SET_INDEX in self.data.columns:
|
352
|
+
for eval_set_index in self.data[EVAL_SET_INDEX].unique():
|
353
|
+
eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
|
354
|
+
if eval_set[target].isna().all():
|
355
|
+
oot_indices.append(eval_set_index)
|
356
|
+
|
338
357
|
for col in columns_to_validate:
|
339
|
-
self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
|
340
358
|
if validate_target and target is not None and col == target:
|
341
|
-
|
359
|
+
if oot_indices:
|
360
|
+
mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
|
361
|
+
invalid_target_mask = (
|
362
|
+
self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
|
363
|
+
)
|
364
|
+
# Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
|
365
|
+
self.data[f"{col}_is_valid"] = True
|
366
|
+
self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
|
367
|
+
else:
|
368
|
+
# No OOT: mark invalid where target is NaN or +/-inf
|
369
|
+
self.data[f"{col}_is_valid"] = ~(
|
370
|
+
self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
|
371
|
+
)
|
372
|
+
else:
|
373
|
+
self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
|
342
374
|
|
343
375
|
if col in mandatory_columns:
|
344
376
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
345
377
|
|
346
|
-
|
378
|
+
# Use stable pandas API across versions: Series.unique keeps order
|
379
|
+
# and collapses multiple NaNs into a single NaN
|
380
|
+
invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
|
347
381
|
valid_share = self.data[f"{col}_is_valid"].sum() / nrows
|
348
382
|
original_col_name = self.columns_renaming[col]
|
349
383
|
validation_stats[original_col_name] = {}
|