upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -0
- upgini/ads.py +6 -2
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +16 -4
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +74 -7
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +20 -1
- upgini/autofe/vector.py +2 -0
- upgini/data_source/data_source_publisher.py +14 -4
- upgini/dataset.py +8 -7
- upgini/errors.py +1 -1
- upgini/features_enricher.py +156 -63
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +160 -96
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +9 -4
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +2 -2
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +75 -18
- upgini/utils/deduplicate_utils.py +61 -18
- upgini/utils/email_utils.py +3 -3
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/features_validator.py +2 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +15 -15
- upgini/utils/target_utils.py +21 -7
- upgini/utils/track_info.py +27 -15
- upgini/version_validator.py +2 -2
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
- upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
- upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
- upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py
CHANGED
|
@@ -1,17 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
1
4
|
import logging
|
|
2
5
|
import re
|
|
3
6
|
from copy import deepcopy
|
|
4
7
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
5
8
|
|
|
9
|
+
import catboost
|
|
6
10
|
import numpy as np
|
|
7
11
|
import pandas as pd
|
|
8
12
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
9
|
-
import catboost
|
|
10
13
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
11
14
|
from numpy import log1p
|
|
12
15
|
from pandas.api.types import is_numeric_dtype
|
|
13
16
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
14
17
|
|
|
18
|
+
from upgini.utils.features_validator import FeaturesValidator
|
|
15
19
|
from upgini.utils.sklearn_ext import cross_validate
|
|
16
20
|
|
|
17
21
|
try:
|
|
@@ -123,7 +127,7 @@ NA_REPLACEMENT = "NA"
|
|
|
123
127
|
|
|
124
128
|
SUPPORTED_CATBOOST_METRICS = {
|
|
125
129
|
s.upper(): s
|
|
126
|
-
for s in
|
|
130
|
+
for s in (
|
|
127
131
|
"Logloss",
|
|
128
132
|
"CrossEntropy",
|
|
129
133
|
"CtrFactor",
|
|
@@ -202,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
202
206
|
"MultiLogloss",
|
|
203
207
|
"MultiCrossEntropy",
|
|
204
208
|
"Combination",
|
|
205
|
-
|
|
209
|
+
)
|
|
206
210
|
}
|
|
207
211
|
|
|
208
212
|
|
|
@@ -234,71 +238,71 @@ class EstimatorWrapper:
|
|
|
234
238
|
self.text_features = text_features
|
|
235
239
|
self.logger = logger or logging.getLogger()
|
|
236
240
|
|
|
237
|
-
def fit(self,
|
|
238
|
-
|
|
241
|
+
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
|
242
|
+
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
|
239
243
|
kwargs.update(fit_params)
|
|
240
|
-
self.estimator.fit(
|
|
244
|
+
self.estimator.fit(x, y, **kwargs)
|
|
241
245
|
return self
|
|
242
246
|
|
|
243
247
|
def predict(self, **kwargs):
|
|
244
248
|
return self.estimator.predict(**kwargs)
|
|
245
249
|
|
|
246
|
-
def _prepare_to_fit(self,
|
|
247
|
-
|
|
248
|
-
return
|
|
250
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
251
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
|
252
|
+
return x, y, groups, {}
|
|
249
253
|
|
|
250
254
|
def _prepare_data(
|
|
251
|
-
self,
|
|
255
|
+
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
252
256
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
253
|
-
for c in
|
|
254
|
-
if is_numeric_dtype(
|
|
255
|
-
|
|
257
|
+
for c in x.columns:
|
|
258
|
+
if is_numeric_dtype(x[c]):
|
|
259
|
+
x[c] = x[c].astype(float)
|
|
256
260
|
else:
|
|
257
|
-
|
|
261
|
+
x[c] = x[c].astype(str)
|
|
258
262
|
|
|
259
263
|
if not isinstance(y, pd.Series):
|
|
260
264
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
|
261
265
|
|
|
262
266
|
if groups is not None:
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
groups =
|
|
267
|
-
|
|
267
|
+
x = x.copy()
|
|
268
|
+
x["__groups"] = groups
|
|
269
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
270
|
+
groups = x["__groups"]
|
|
271
|
+
x = x.drop(columns="__groups")
|
|
268
272
|
else:
|
|
269
|
-
|
|
273
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
270
274
|
|
|
271
|
-
return
|
|
275
|
+
return x, y, groups
|
|
272
276
|
|
|
273
|
-
def _remove_empty_target_rows(self,
|
|
274
|
-
joined = pd.concat([
|
|
277
|
+
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
278
|
+
joined = pd.concat([x, y], axis=1)
|
|
275
279
|
joined = joined[joined[y.name].notna()]
|
|
276
280
|
joined = joined.reset_index(drop=True)
|
|
277
|
-
|
|
281
|
+
x = joined.drop(columns=y.name)
|
|
278
282
|
y = np.array(list(joined[y.name].values))
|
|
279
283
|
|
|
280
|
-
return
|
|
284
|
+
return x, y
|
|
281
285
|
|
|
282
|
-
def _prepare_to_calculate(self,
|
|
283
|
-
|
|
284
|
-
return
|
|
286
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
287
|
+
x, y, _ = self._prepare_data(x, y)
|
|
288
|
+
return x, y, {}
|
|
285
289
|
|
|
286
290
|
def cross_val_predict(
|
|
287
|
-
self,
|
|
291
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
288
292
|
) -> Optional[float]:
|
|
289
|
-
|
|
293
|
+
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
290
294
|
|
|
291
|
-
if
|
|
295
|
+
if x.shape[1] == 0:
|
|
292
296
|
return None
|
|
293
297
|
|
|
294
298
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
295
299
|
|
|
296
300
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
297
|
-
metric = roc_auc_score(y,
|
|
301
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
298
302
|
else:
|
|
299
303
|
cv_results = cross_validate(
|
|
300
304
|
estimator=self.estimator,
|
|
301
|
-
|
|
305
|
+
x=x,
|
|
302
306
|
y=y,
|
|
303
307
|
scoring=scorer,
|
|
304
308
|
cv=self.cv,
|
|
@@ -318,14 +322,14 @@ class EstimatorWrapper:
|
|
|
318
322
|
metric = 2 * metric - 1
|
|
319
323
|
return metric
|
|
320
324
|
|
|
321
|
-
def calculate_metric(self,
|
|
322
|
-
|
|
325
|
+
def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
|
|
326
|
+
x, y, _ = self._prepare_to_calculate(x, y)
|
|
323
327
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
324
|
-
metric = roc_auc_score(y,
|
|
328
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
325
329
|
else:
|
|
326
330
|
metrics = []
|
|
327
331
|
for est in self.cv_estimators:
|
|
328
|
-
metrics.append(self.scorer(est,
|
|
332
|
+
metrics.append(self.scorer(est, x, y))
|
|
329
333
|
|
|
330
334
|
metric = np.mean(metrics) * self.multiplier
|
|
331
335
|
return self.post_process_metric(metric)
|
|
@@ -336,13 +340,13 @@ class EstimatorWrapper:
|
|
|
336
340
|
logger: logging.Logger,
|
|
337
341
|
target_type: ModelTaskType,
|
|
338
342
|
cv: BaseCrossValidator,
|
|
339
|
-
|
|
343
|
+
x: pd.DataFrame,
|
|
340
344
|
scoring: Union[Callable, str, None] = None,
|
|
341
345
|
cat_features: Optional[List[str]] = None,
|
|
342
346
|
text_features: Optional[List[str]] = None,
|
|
343
347
|
add_params: Optional[Dict[str, Any]] = None,
|
|
344
348
|
groups: Optional[List[str]] = None,
|
|
345
|
-
) ->
|
|
349
|
+
) -> EstimatorWrapper:
|
|
346
350
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
347
351
|
kwargs = {
|
|
348
352
|
"scorer": scorer,
|
|
@@ -352,6 +356,7 @@ class EstimatorWrapper:
|
|
|
352
356
|
"target_type": target_type,
|
|
353
357
|
"groups": groups,
|
|
354
358
|
"text_features": text_features,
|
|
359
|
+
"logger": logger,
|
|
355
360
|
}
|
|
356
361
|
if estimator is None:
|
|
357
362
|
params = dict()
|
|
@@ -377,15 +382,20 @@ class EstimatorWrapper:
|
|
|
377
382
|
else:
|
|
378
383
|
estimator_copy = deepcopy(estimator)
|
|
379
384
|
kwargs["estimator"] = estimator_copy
|
|
380
|
-
if isinstance(estimator, CatBoostClassifier
|
|
385
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
381
386
|
if cat_features is not None:
|
|
387
|
+
for cat_feature in cat_features:
|
|
388
|
+
if cat_feature not in x.columns:
|
|
389
|
+
logger.error(
|
|
390
|
+
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
391
|
+
)
|
|
382
392
|
estimator_copy.set_params(
|
|
383
|
-
cat_features=[
|
|
393
|
+
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
384
394
|
)
|
|
385
395
|
estimator = CatBoostWrapper(**kwargs)
|
|
386
396
|
else:
|
|
387
397
|
try:
|
|
388
|
-
if isinstance(estimator, LGBMClassifier
|
|
398
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
389
399
|
estimator = LightGBMWrapper(**kwargs)
|
|
390
400
|
else:
|
|
391
401
|
logger.warning(
|
|
@@ -414,32 +424,40 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
414
424
|
target_type: ModelTaskType,
|
|
415
425
|
groups: Optional[List[str]] = None,
|
|
416
426
|
text_features: Optional[List[str]] = None,
|
|
427
|
+
logger: Optional[logging.Logger] = None,
|
|
417
428
|
):
|
|
418
429
|
super(CatBoostWrapper, self).__init__(
|
|
419
|
-
estimator,
|
|
430
|
+
estimator,
|
|
431
|
+
scorer,
|
|
432
|
+
metric_name,
|
|
433
|
+
multiplier,
|
|
434
|
+
cv,
|
|
435
|
+
target_type,
|
|
436
|
+
groups=groups,
|
|
437
|
+
text_features=text_features,
|
|
438
|
+
logger=logger,
|
|
420
439
|
)
|
|
421
440
|
self.cat_features = None
|
|
422
441
|
self.emb_features = None
|
|
442
|
+
self.exclude_features = []
|
|
423
443
|
|
|
424
|
-
def _prepare_to_fit(self,
|
|
425
|
-
|
|
444
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
445
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
426
446
|
|
|
427
447
|
# Find embeddings
|
|
428
448
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
429
449
|
emb_pattern = r"(.+)_emb\d+"
|
|
430
|
-
self.emb_features = [c for c in
|
|
450
|
+
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
431
451
|
embedding_features = []
|
|
432
452
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
433
453
|
self.logger.info(
|
|
434
454
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
435
455
|
f"{self.emb_features}"
|
|
436
456
|
)
|
|
437
|
-
|
|
457
|
+
x, embedding_features = self.group_embeddings(x)
|
|
438
458
|
params["embedding_features"] = embedding_features
|
|
439
459
|
else:
|
|
440
|
-
self.logger.info(
|
|
441
|
-
f"Embedding features count less than 3, so use them separately: {self.emb_features}"
|
|
442
|
-
)
|
|
460
|
+
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
443
461
|
self.emb_features = []
|
|
444
462
|
else:
|
|
445
463
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
@@ -448,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
448
466
|
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
|
449
467
|
if self.text_features is not None:
|
|
450
468
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
451
|
-
self.text_features = [f for f in self.text_features if f in
|
|
469
|
+
self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
|
|
452
470
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
453
471
|
params["text_features"] = self.text_features
|
|
454
472
|
else:
|
|
@@ -456,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
456
474
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
457
475
|
|
|
458
476
|
# Find rest categorical features
|
|
459
|
-
self.cat_features = _get_cat_features(
|
|
460
|
-
|
|
477
|
+
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
478
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
461
479
|
unique_cat_features = []
|
|
462
480
|
for name in self.cat_features:
|
|
463
481
|
# Remove constant categorical features
|
|
464
|
-
if
|
|
482
|
+
if x[name].nunique() > 1:
|
|
465
483
|
unique_cat_features.append(name)
|
|
466
484
|
else:
|
|
467
|
-
|
|
485
|
+
x = x.drop(columns=name)
|
|
468
486
|
self.cat_features = unique_cat_features
|
|
469
487
|
if (
|
|
470
488
|
hasattr(self.estimator, "get_param")
|
|
@@ -473,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
473
491
|
):
|
|
474
492
|
estimator_cat_features = self.estimator.get_param("cat_features")
|
|
475
493
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
476
|
-
cat_features_idx = {
|
|
494
|
+
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
|
477
495
|
cat_features_idx.update(estimator_cat_features)
|
|
478
|
-
self.cat_features = [
|
|
496
|
+
self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
|
|
479
497
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
480
498
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
481
499
|
else:
|
|
@@ -486,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
486
504
|
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
487
505
|
params["cat_features"] = self.cat_features
|
|
488
506
|
|
|
489
|
-
return
|
|
507
|
+
return x, y, groups, params
|
|
490
508
|
|
|
491
509
|
def group_embeddings(self, df: pd.DataFrame):
|
|
492
510
|
emb_name = "__grouped_embeddings"
|
|
@@ -497,18 +515,40 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
497
515
|
|
|
498
516
|
return df, [emb_name]
|
|
499
517
|
|
|
500
|
-
def _prepare_to_calculate(self,
|
|
501
|
-
|
|
518
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
519
|
+
if self.exclude_features:
|
|
520
|
+
x = x.drop(columns=self.exclude_features)
|
|
521
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
502
522
|
if self.text_features:
|
|
503
523
|
params["text_features"] = self.text_features
|
|
504
524
|
if self.emb_features:
|
|
505
|
-
|
|
525
|
+
x, emb_columns = self.group_embeddings(x)
|
|
506
526
|
params["embedding_features"] = emb_columns
|
|
507
527
|
if self.cat_features:
|
|
508
|
-
|
|
528
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
509
529
|
params["cat_features"] = self.cat_features
|
|
510
530
|
|
|
511
|
-
return
|
|
531
|
+
return x, y, params
|
|
532
|
+
|
|
533
|
+
def cross_val_predict(
|
|
534
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
535
|
+
) -> Optional[float]:
|
|
536
|
+
try:
|
|
537
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
538
|
+
except Exception as e:
|
|
539
|
+
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
540
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
|
|
541
|
+
self.logger.warning(
|
|
542
|
+
"Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
|
|
543
|
+
f" text features {high_cardinality_features} and retry"
|
|
544
|
+
)
|
|
545
|
+
for f in high_cardinality_features:
|
|
546
|
+
self.text_features.remove(f)
|
|
547
|
+
self.exclude_features.append(f)
|
|
548
|
+
x = x.drop(columns=f)
|
|
549
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
550
|
+
else:
|
|
551
|
+
raise e
|
|
512
552
|
|
|
513
553
|
|
|
514
554
|
class LightGBMWrapper(EstimatorWrapper):
|
|
@@ -522,32 +562,41 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
522
562
|
target_type: ModelTaskType,
|
|
523
563
|
groups: Optional[List[str]] = None,
|
|
524
564
|
text_features: Optional[List[str]] = None,
|
|
565
|
+
logger: Optional[logging.Logger] = None,
|
|
525
566
|
):
|
|
526
567
|
super(LightGBMWrapper, self).__init__(
|
|
527
|
-
estimator,
|
|
568
|
+
estimator,
|
|
569
|
+
scorer,
|
|
570
|
+
metric_name,
|
|
571
|
+
multiplier,
|
|
572
|
+
cv,
|
|
573
|
+
target_type,
|
|
574
|
+
groups=groups,
|
|
575
|
+
text_features=text_features,
|
|
576
|
+
logger=logger,
|
|
528
577
|
)
|
|
529
578
|
self.cat_features = None
|
|
530
579
|
|
|
531
|
-
def _prepare_to_fit(self,
|
|
532
|
-
|
|
533
|
-
self.cat_features = _get_cat_features(
|
|
534
|
-
|
|
580
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
581
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
582
|
+
self.cat_features = _get_cat_features(x)
|
|
583
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
535
584
|
for feature in self.cat_features:
|
|
536
|
-
|
|
585
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
537
586
|
if not is_numeric_dtype(y):
|
|
538
587
|
y = correct_string_target(y)
|
|
539
588
|
|
|
540
|
-
return
|
|
589
|
+
return x, y, groups, params
|
|
541
590
|
|
|
542
|
-
def _prepare_to_calculate(self,
|
|
543
|
-
|
|
591
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
592
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
544
593
|
if self.cat_features is not None:
|
|
545
|
-
|
|
594
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
546
595
|
for feature in self.cat_features:
|
|
547
|
-
|
|
596
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
548
597
|
if not is_numeric_dtype(y):
|
|
549
598
|
y = correct_string_target(y)
|
|
550
|
-
return
|
|
599
|
+
return x, y, params
|
|
551
600
|
|
|
552
601
|
|
|
553
602
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
@@ -561,54 +610,69 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
561
610
|
target_type: ModelTaskType,
|
|
562
611
|
groups: Optional[List[str]] = None,
|
|
563
612
|
text_features: Optional[List[str]] = None,
|
|
613
|
+
logger: Optional[logging.Logger] = None,
|
|
564
614
|
):
|
|
565
615
|
super(OtherEstimatorWrapper, self).__init__(
|
|
566
|
-
estimator,
|
|
616
|
+
estimator,
|
|
617
|
+
scorer,
|
|
618
|
+
metric_name,
|
|
619
|
+
multiplier,
|
|
620
|
+
cv,
|
|
621
|
+
target_type,
|
|
622
|
+
groups=groups,
|
|
623
|
+
text_features=text_features,
|
|
624
|
+
logger=logger,
|
|
567
625
|
)
|
|
568
626
|
self.cat_features = None
|
|
569
627
|
|
|
570
|
-
def _prepare_to_fit(self,
|
|
571
|
-
|
|
572
|
-
self.cat_features = _get_cat_features(
|
|
573
|
-
num_features = [col for col in
|
|
574
|
-
|
|
575
|
-
|
|
628
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
629
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
630
|
+
self.cat_features = _get_cat_features(x)
|
|
631
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
632
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
633
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
576
634
|
# TODO use one-hot encoding if cardinality is less 50
|
|
577
635
|
for feature in self.cat_features:
|
|
578
|
-
|
|
636
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
579
637
|
if not is_numeric_dtype(y):
|
|
580
638
|
y = correct_string_target(y)
|
|
581
|
-
return
|
|
639
|
+
return x, y, groups, params
|
|
582
640
|
|
|
583
|
-
def _prepare_to_calculate(self,
|
|
584
|
-
|
|
641
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
642
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
585
643
|
if self.cat_features is not None:
|
|
586
|
-
num_features = [col for col in
|
|
587
|
-
|
|
588
|
-
|
|
644
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
645
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
646
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
589
647
|
# TODO use one-hot encoding if cardinality is less 50
|
|
590
648
|
for feature in self.cat_features:
|
|
591
|
-
|
|
649
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
592
650
|
if not is_numeric_dtype(y):
|
|
593
651
|
y = correct_string_target(y)
|
|
594
|
-
return
|
|
652
|
+
return x, y, params
|
|
595
653
|
|
|
596
654
|
|
|
597
655
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
598
656
|
if isinstance(scoring, str) and scoring is not None:
|
|
599
657
|
_get_scorer_by_name(scoring)
|
|
658
|
+
elif isinstance(scoring, Callable):
|
|
659
|
+
spec = inspect.getfullargspec(scoring)
|
|
660
|
+
if len(spec.args) < 3:
|
|
661
|
+
raise ValidationError(
|
|
662
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
663
|
+
)
|
|
600
664
|
|
|
601
665
|
|
|
602
666
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
603
667
|
metric_name = scoring
|
|
604
668
|
multiplier = 1
|
|
605
|
-
if "mean_squared_log_error"
|
|
669
|
+
if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
|
|
606
670
|
scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
|
|
607
671
|
multiplier = -1
|
|
608
|
-
elif "root_mean_squared_log_error" in metric_name or "RMSLE"
|
|
672
|
+
elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
|
|
609
673
|
scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
|
|
610
674
|
multiplier = -1
|
|
611
|
-
elif "root_mean_squared_error"
|
|
675
|
+
elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
|
|
612
676
|
scoring = get_scorer("neg_root_mean_squared_error")
|
|
613
677
|
multiplier = -1
|
|
614
678
|
elif scoring in available_scorers:
|
|
@@ -660,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
660
724
|
|
|
661
725
|
|
|
662
726
|
def _get_cat_features(
|
|
663
|
-
|
|
727
|
+
x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
|
|
664
728
|
) -> List[str]:
|
|
665
729
|
text_features = text_features or []
|
|
666
730
|
emb_features = emb_features or []
|
|
667
731
|
exclude_features = text_features + emb_features
|
|
668
|
-
return [c for c in
|
|
732
|
+
return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
|
|
669
733
|
|
|
670
734
|
|
|
671
735
|
def _get_add_params(input_params, add_params):
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name]):
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
|
|
|
17
17
|
__email__ = "felix.zenk@web.de"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class _Parser
|
|
20
|
+
class _Parser:
|
|
21
21
|
"""
|
|
22
22
|
A parser for the .properties file format.
|
|
23
23
|
"""
|
|
@@ -49,7 +49,7 @@ class _Parser(object):
|
|
|
49
49
|
return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
|
|
50
50
|
|
|
51
51
|
# I/O read
|
|
52
|
-
with open(file_path,
|
|
52
|
+
with open(file_path, encoding="utf-8") as f:
|
|
53
53
|
lines = f.readlines()
|
|
54
54
|
|
|
55
55
|
# parse
|
|
@@ -83,7 +83,7 @@ class _Parser(object):
|
|
|
83
83
|
return mapping
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
class ResourceBundle
|
|
86
|
+
class ResourceBundle:
|
|
87
87
|
"""
|
|
88
88
|
A ResourceBundle manages internationalization of string resources
|
|
89
89
|
"""
|
|
@@ -199,7 +199,7 @@ class ResourceBundle(object):
|
|
|
199
199
|
raise NotInResourceBundleError(self.name, item)
|
|
200
200
|
|
|
201
201
|
|
|
202
|
-
def get_bundle(bundle_name: str, locale: str | Sequence[str
|
|
202
|
+
def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
|
|
203
203
|
"""
|
|
204
204
|
Return a new :class:`ResourceBundle` after parsing the locale
|
|
205
205
|
|
|
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
|
|
|
224
224
|
custom_bundles = dict()
|
|
225
225
|
|
|
226
226
|
|
|
227
|
-
def get_custom_bundle(custom_cfg: Optional[str] = None) ->
|
|
227
|
+
def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
|
|
228
228
|
global custom_bundles
|
|
229
229
|
if custom_cfg is not None:
|
|
230
230
|
custom_bundle = custom_bundles.get(custom_cfg)
|
|
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
+
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
42
|
|
|
42
43
|
# Errors
|
|
43
44
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -111,6 +112,9 @@ x_is_empty=X is empty
|
|
|
111
112
|
y_is_empty=y is empty
|
|
112
113
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
114
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
115
|
+
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
|
+
train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
|
+
eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
114
118
|
# eval set validation
|
|
115
119
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
116
120
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
|
|
|
145
149
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
146
150
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
147
151
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
148
|
-
|
|
152
|
+
dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
153
|
+
dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
149
154
|
dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
150
155
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
151
156
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
@@ -154,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
154
159
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
155
160
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
156
161
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
157
|
-
dataset_rarest_class_less_min=
|
|
162
|
+
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
158
163
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
159
164
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
160
165
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
196
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
197
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
198
203
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
199
|
-
target_type_detected
|
|
204
|
+
target_type_detected=\nDetected task type: {}\n
|
|
200
205
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
201
206
|
all_ok_community_invite=❓ Support request
|
|
202
|
-
too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
207
|
+
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
203
208
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
204
209
|
loss_selection_info=Using loss `{}` for feature selection
|
|
205
210
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
upgini/sampler/base.py
CHANGED
|
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
|
|
|
9
9
|
from typing import List, Optional
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
|
-
|
|
13
12
|
from sklearn.base import BaseEstimator
|
|
14
13
|
from sklearn.preprocessing import label_binarize
|
|
15
14
|
from sklearn.utils.multiclass import check_classification_targets
|
|
16
15
|
|
|
17
|
-
from .utils import check_sampling_strategy, check_target_type
|
|
18
|
-
from .utils import ArraysTransformer
|
|
16
|
+
from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
|
107
105
|
The corresponding label of `X_resampled`.
|
|
108
106
|
|
|
109
107
|
"""
|
|
110
|
-
pass
|
|
111
108
|
|
|
112
109
|
@abstractmethod
|
|
113
110
|
def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
|