upgini 1.1.279__py3-none-any.whl → 1.1.279a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/ads_management/ads_manager.py +2 -4
- upgini/autofe/all_operands.py +2 -3
- upgini/autofe/binary.py +1 -2
- upgini/autofe/date.py +1 -2
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +1 -3
- upgini/autofe/operand.py +3 -4
- upgini/autofe/unary.py +1 -2
- upgini/autofe/vector.py +0 -2
- upgini/dataset.py +4 -4
- upgini/errors.py +1 -1
- upgini/features_enricher.py +4 -4
- upgini/fingerprint.js +8 -0
- upgini/http.py +10 -11
- upgini/mdc/__init__.py +3 -1
- upgini/mdc/context.py +6 -4
- upgini/metadata.py +0 -3
- upgini/metrics.py +99 -101
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/sampler/base.py +4 -1
- upgini/sampler/random_under_sampler.py +5 -2
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +1 -1
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +2 -4
- upgini/utils/country_utils.py +1 -1
- upgini/utils/custom_loss_utils.py +2 -3
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +6 -12
- upgini/utils/email_utils.py +2 -2
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +13 -14
- upgini/utils/track_info.py +2 -2
- upgini/version_validator.py +2 -2
- {upgini-1.1.279.dist-info → upgini-1.1.279a1.dist-info}/METADATA +23 -21
- upgini-1.1.279a1.dist-info/RECORD +63 -0
- {upgini-1.1.279.dist-info → upgini-1.1.279a1.dist-info}/WHEEL +2 -1
- upgini-1.1.279a1.dist-info/top_level.txt +1 -0
- upgini/__about__.py +0 -1
- upgini-1.1.279.dist-info/RECORD +0 -62
- {upgini-1.1.279.dist-info/licenses → upgini-1.1.279a1.dist-info}/LICENSE +0 -0
upgini/metrics.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
1
|
import inspect
|
|
4
2
|
import logging
|
|
5
3
|
import re
|
|
@@ -127,7 +125,7 @@ NA_REPLACEMENT = "NA"
|
|
|
127
125
|
|
|
128
126
|
SUPPORTED_CATBOOST_METRICS = {
|
|
129
127
|
s.upper(): s
|
|
130
|
-
for s in
|
|
128
|
+
for s in {
|
|
131
129
|
"Logloss",
|
|
132
130
|
"CrossEntropy",
|
|
133
131
|
"CtrFactor",
|
|
@@ -206,7 +204,7 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
206
204
|
"MultiLogloss",
|
|
207
205
|
"MultiCrossEntropy",
|
|
208
206
|
"Combination",
|
|
209
|
-
|
|
207
|
+
}
|
|
210
208
|
}
|
|
211
209
|
|
|
212
210
|
|
|
@@ -238,71 +236,71 @@ class EstimatorWrapper:
|
|
|
238
236
|
self.text_features = text_features
|
|
239
237
|
self.logger = logger or logging.getLogger()
|
|
240
238
|
|
|
241
|
-
def fit(self,
|
|
242
|
-
|
|
239
|
+
def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
|
|
240
|
+
X, y, _, fit_params = self._prepare_to_fit(X, y)
|
|
243
241
|
kwargs.update(fit_params)
|
|
244
|
-
self.estimator.fit(
|
|
242
|
+
self.estimator.fit(X, y, **kwargs)
|
|
245
243
|
return self
|
|
246
244
|
|
|
247
245
|
def predict(self, **kwargs):
|
|
248
246
|
return self.estimator.predict(**kwargs)
|
|
249
247
|
|
|
250
|
-
def _prepare_to_fit(self,
|
|
251
|
-
|
|
252
|
-
return
|
|
248
|
+
def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
249
|
+
X, y, groups = self._prepare_data(X, y, groups=self.groups)
|
|
250
|
+
return X, y, groups, {}
|
|
253
251
|
|
|
254
252
|
def _prepare_data(
|
|
255
|
-
self,
|
|
253
|
+
self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
256
254
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
257
|
-
for c in
|
|
258
|
-
if is_numeric_dtype(
|
|
259
|
-
|
|
255
|
+
for c in X.columns:
|
|
256
|
+
if is_numeric_dtype(X[c]):
|
|
257
|
+
X[c] = X[c].astype(float)
|
|
260
258
|
else:
|
|
261
|
-
|
|
259
|
+
X[c] = X[c].astype(str)
|
|
262
260
|
|
|
263
261
|
if not isinstance(y, pd.Series):
|
|
264
262
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
|
265
263
|
|
|
266
264
|
if groups is not None:
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
groups =
|
|
271
|
-
|
|
265
|
+
X = X.copy()
|
|
266
|
+
X["__groups"] = groups
|
|
267
|
+
X, y = self._remove_empty_target_rows(X, y)
|
|
268
|
+
groups = X["__groups"]
|
|
269
|
+
X = X.drop(columns="__groups")
|
|
272
270
|
else:
|
|
273
|
-
|
|
271
|
+
X, y = self._remove_empty_target_rows(X, y)
|
|
274
272
|
|
|
275
|
-
return
|
|
273
|
+
return X, y, groups
|
|
276
274
|
|
|
277
|
-
def _remove_empty_target_rows(self,
|
|
278
|
-
joined = pd.concat([
|
|
275
|
+
def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
276
|
+
joined = pd.concat([X, y], axis=1)
|
|
279
277
|
joined = joined[joined[y.name].notna()]
|
|
280
278
|
joined = joined.reset_index(drop=True)
|
|
281
|
-
|
|
279
|
+
X = joined.drop(columns=y.name)
|
|
282
280
|
y = np.array(list(joined[y.name].values))
|
|
283
281
|
|
|
284
|
-
return
|
|
282
|
+
return X, y
|
|
285
283
|
|
|
286
|
-
def _prepare_to_calculate(self,
|
|
287
|
-
|
|
288
|
-
return
|
|
284
|
+
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
285
|
+
X, y, _ = self._prepare_data(X, y)
|
|
286
|
+
return X, y, {}
|
|
289
287
|
|
|
290
288
|
def cross_val_predict(
|
|
291
|
-
self,
|
|
289
|
+
self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
292
290
|
) -> Optional[float]:
|
|
293
|
-
|
|
291
|
+
X, y, groups, fit_params = self._prepare_to_fit(X, y)
|
|
294
292
|
|
|
295
|
-
if
|
|
293
|
+
if X.shape[1] == 0:
|
|
296
294
|
return None
|
|
297
295
|
|
|
298
296
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
299
297
|
|
|
300
298
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
301
|
-
metric = roc_auc_score(y,
|
|
299
|
+
metric = roc_auc_score(y, X[baseline_score_column])
|
|
302
300
|
else:
|
|
303
301
|
cv_results = cross_validate(
|
|
304
302
|
estimator=self.estimator,
|
|
305
|
-
|
|
303
|
+
X=X,
|
|
306
304
|
y=y,
|
|
307
305
|
scoring=scorer,
|
|
308
306
|
cv=self.cv,
|
|
@@ -322,14 +320,14 @@ class EstimatorWrapper:
|
|
|
322
320
|
metric = 2 * metric - 1
|
|
323
321
|
return metric
|
|
324
322
|
|
|
325
|
-
def calculate_metric(self,
|
|
326
|
-
|
|
323
|
+
def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
|
|
324
|
+
X, y, _ = self._prepare_to_calculate(X, y)
|
|
327
325
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
328
|
-
metric = roc_auc_score(y,
|
|
326
|
+
metric = roc_auc_score(y, X[baseline_score_column])
|
|
329
327
|
else:
|
|
330
328
|
metrics = []
|
|
331
329
|
for est in self.cv_estimators:
|
|
332
|
-
metrics.append(self.scorer(est,
|
|
330
|
+
metrics.append(self.scorer(est, X, y))
|
|
333
331
|
|
|
334
332
|
metric = np.mean(metrics) * self.multiplier
|
|
335
333
|
return self.post_process_metric(metric)
|
|
@@ -340,13 +338,13 @@ class EstimatorWrapper:
|
|
|
340
338
|
logger: logging.Logger,
|
|
341
339
|
target_type: ModelTaskType,
|
|
342
340
|
cv: BaseCrossValidator,
|
|
343
|
-
|
|
341
|
+
X: pd.DataFrame,
|
|
344
342
|
scoring: Union[Callable, str, None] = None,
|
|
345
343
|
cat_features: Optional[List[str]] = None,
|
|
346
344
|
text_features: Optional[List[str]] = None,
|
|
347
345
|
add_params: Optional[Dict[str, Any]] = None,
|
|
348
346
|
groups: Optional[List[str]] = None,
|
|
349
|
-
) -> EstimatorWrapper:
|
|
347
|
+
) -> "EstimatorWrapper":
|
|
350
348
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
351
349
|
kwargs = {
|
|
352
350
|
"scorer": scorer,
|
|
@@ -382,20 +380,20 @@ class EstimatorWrapper:
|
|
|
382
380
|
else:
|
|
383
381
|
estimator_copy = deepcopy(estimator)
|
|
384
382
|
kwargs["estimator"] = estimator_copy
|
|
385
|
-
if isinstance(estimator, (
|
|
383
|
+
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
386
384
|
if cat_features is not None:
|
|
387
385
|
for cat_feature in cat_features:
|
|
388
|
-
if cat_feature not in
|
|
386
|
+
if cat_feature not in X.columns:
|
|
389
387
|
logger.error(
|
|
390
|
-
f"Client cat_feature `{cat_feature}` not found in
|
|
388
|
+
f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
|
|
391
389
|
)
|
|
392
390
|
estimator_copy.set_params(
|
|
393
|
-
cat_features=[
|
|
391
|
+
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
394
392
|
)
|
|
395
393
|
estimator = CatBoostWrapper(**kwargs)
|
|
396
394
|
else:
|
|
397
395
|
try:
|
|
398
|
-
if isinstance(estimator, (
|
|
396
|
+
if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
|
|
399
397
|
estimator = LightGBMWrapper(**kwargs)
|
|
400
398
|
else:
|
|
401
399
|
logger.warning(
|
|
@@ -441,20 +439,20 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
441
439
|
self.emb_features = None
|
|
442
440
|
self.exclude_features = []
|
|
443
441
|
|
|
444
|
-
def _prepare_to_fit(self,
|
|
445
|
-
|
|
442
|
+
def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
443
|
+
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
446
444
|
|
|
447
445
|
# Find embeddings
|
|
448
446
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
449
447
|
emb_pattern = r"(.+)_emb\d+"
|
|
450
|
-
self.emb_features = [c for c in
|
|
448
|
+
self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
|
|
451
449
|
embedding_features = []
|
|
452
450
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
453
451
|
self.logger.info(
|
|
454
452
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
455
453
|
f"{self.emb_features}"
|
|
456
454
|
)
|
|
457
|
-
|
|
455
|
+
X, embedding_features = self.group_embeddings(X)
|
|
458
456
|
params["embedding_features"] = embedding_features
|
|
459
457
|
else:
|
|
460
458
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
@@ -466,7 +464,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
466
464
|
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
|
467
465
|
if self.text_features is not None:
|
|
468
466
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
469
|
-
self.text_features = [f for f in self.text_features if f in
|
|
467
|
+
self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
|
|
470
468
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
471
469
|
params["text_features"] = self.text_features
|
|
472
470
|
else:
|
|
@@ -474,15 +472,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
474
472
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
475
473
|
|
|
476
474
|
# Find rest categorical features
|
|
477
|
-
self.cat_features = _get_cat_features(
|
|
478
|
-
|
|
475
|
+
self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
|
|
476
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
479
477
|
unique_cat_features = []
|
|
480
478
|
for name in self.cat_features:
|
|
481
479
|
# Remove constant categorical features
|
|
482
|
-
if
|
|
480
|
+
if X[name].nunique() > 1:
|
|
483
481
|
unique_cat_features.append(name)
|
|
484
482
|
else:
|
|
485
|
-
|
|
483
|
+
X = X.drop(columns=name)
|
|
486
484
|
self.cat_features = unique_cat_features
|
|
487
485
|
if (
|
|
488
486
|
hasattr(self.estimator, "get_param")
|
|
@@ -491,9 +489,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
491
489
|
):
|
|
492
490
|
estimator_cat_features = self.estimator.get_param("cat_features")
|
|
493
491
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
494
|
-
cat_features_idx = {
|
|
492
|
+
cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
|
|
495
493
|
cat_features_idx.update(estimator_cat_features)
|
|
496
|
-
self.cat_features = [
|
|
494
|
+
self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
|
|
497
495
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
498
496
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
499
497
|
else:
|
|
@@ -504,7 +502,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
504
502
|
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
505
503
|
params["cat_features"] = self.cat_features
|
|
506
504
|
|
|
507
|
-
return
|
|
505
|
+
return X, y, groups, params
|
|
508
506
|
|
|
509
507
|
def group_embeddings(self, df: pd.DataFrame):
|
|
510
508
|
emb_name = "__grouped_embeddings"
|
|
@@ -515,38 +513,38 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
515
513
|
|
|
516
514
|
return df, [emb_name]
|
|
517
515
|
|
|
518
|
-
def _prepare_to_calculate(self,
|
|
516
|
+
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
519
517
|
if self.exclude_features:
|
|
520
|
-
|
|
521
|
-
|
|
518
|
+
X = X.drop(columns=self.exclude_features)
|
|
519
|
+
X, y, params = super()._prepare_to_calculate(X, y)
|
|
522
520
|
if self.text_features:
|
|
523
521
|
params["text_features"] = self.text_features
|
|
524
522
|
if self.emb_features:
|
|
525
|
-
|
|
523
|
+
X, emb_columns = self.group_embeddings(X)
|
|
526
524
|
params["embedding_features"] = emb_columns
|
|
527
525
|
if self.cat_features:
|
|
528
|
-
|
|
526
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
529
527
|
params["cat_features"] = self.cat_features
|
|
530
528
|
|
|
531
|
-
return
|
|
529
|
+
return X, y, params
|
|
532
530
|
|
|
533
531
|
def cross_val_predict(
|
|
534
|
-
self,
|
|
532
|
+
self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
535
533
|
) -> Optional[float]:
|
|
536
534
|
try:
|
|
537
|
-
return super().cross_val_predict(
|
|
535
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
538
536
|
except Exception as e:
|
|
539
537
|
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
540
|
-
high_cardinality_features = FeaturesValidator.find_high_cardinality(
|
|
538
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
|
|
541
539
|
self.logger.warning(
|
|
542
|
-
"
|
|
540
|
+
"Failed to calculate metrics. Try to remove high cardinality"
|
|
543
541
|
f" text features {high_cardinality_features} and retry"
|
|
544
542
|
)
|
|
545
543
|
for f in high_cardinality_features:
|
|
546
544
|
self.text_features.remove(f)
|
|
547
545
|
self.exclude_features.append(f)
|
|
548
|
-
|
|
549
|
-
return super().cross_val_predict(
|
|
546
|
+
X = X.drop(columns=f)
|
|
547
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
550
548
|
else:
|
|
551
549
|
raise e
|
|
552
550
|
|
|
@@ -577,26 +575,26 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
577
575
|
)
|
|
578
576
|
self.cat_features = None
|
|
579
577
|
|
|
580
|
-
def _prepare_to_fit(self,
|
|
581
|
-
|
|
582
|
-
self.cat_features = _get_cat_features(
|
|
583
|
-
|
|
578
|
+
def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
579
|
+
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
580
|
+
self.cat_features = _get_cat_features(X)
|
|
581
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
584
582
|
for feature in self.cat_features:
|
|
585
|
-
|
|
583
|
+
X[feature] = X[feature].astype("category").cat.codes
|
|
586
584
|
if not is_numeric_dtype(y):
|
|
587
585
|
y = correct_string_target(y)
|
|
588
586
|
|
|
589
|
-
return
|
|
587
|
+
return X, y, groups, params
|
|
590
588
|
|
|
591
|
-
def _prepare_to_calculate(self,
|
|
592
|
-
|
|
589
|
+
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
590
|
+
X, y, params = super()._prepare_to_calculate(X, y)
|
|
593
591
|
if self.cat_features is not None:
|
|
594
|
-
|
|
592
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
595
593
|
for feature in self.cat_features:
|
|
596
|
-
|
|
594
|
+
X[feature] = X[feature].astype("category").cat.codes
|
|
597
595
|
if not is_numeric_dtype(y):
|
|
598
596
|
y = correct_string_target(y)
|
|
599
|
-
return
|
|
597
|
+
return X, y, params
|
|
600
598
|
|
|
601
599
|
|
|
602
600
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
@@ -625,31 +623,31 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
625
623
|
)
|
|
626
624
|
self.cat_features = None
|
|
627
625
|
|
|
628
|
-
def _prepare_to_fit(self,
|
|
629
|
-
|
|
630
|
-
self.cat_features = _get_cat_features(
|
|
631
|
-
num_features = [col for col in
|
|
632
|
-
|
|
633
|
-
|
|
626
|
+
def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
627
|
+
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
628
|
+
self.cat_features = _get_cat_features(X)
|
|
629
|
+
num_features = [col for col in X.columns if col not in self.cat_features]
|
|
630
|
+
X[num_features] = X[num_features].fillna(-999)
|
|
631
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
634
632
|
# TODO use one-hot encoding if cardinality is less 50
|
|
635
633
|
for feature in self.cat_features:
|
|
636
|
-
|
|
634
|
+
X[feature] = X[feature].astype("category").cat.codes
|
|
637
635
|
if not is_numeric_dtype(y):
|
|
638
636
|
y = correct_string_target(y)
|
|
639
|
-
return
|
|
637
|
+
return X, y, groups, params
|
|
640
638
|
|
|
641
|
-
def _prepare_to_calculate(self,
|
|
642
|
-
|
|
639
|
+
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
640
|
+
X, y, params = super()._prepare_to_calculate(X, y)
|
|
643
641
|
if self.cat_features is not None:
|
|
644
|
-
num_features = [col for col in
|
|
645
|
-
|
|
646
|
-
|
|
642
|
+
num_features = [col for col in X.columns if col not in self.cat_features]
|
|
643
|
+
X[num_features] = X[num_features].fillna(-999)
|
|
644
|
+
X = fill_na_cat_features(X, self.cat_features)
|
|
647
645
|
# TODO use one-hot encoding if cardinality is less 50
|
|
648
646
|
for feature in self.cat_features:
|
|
649
|
-
|
|
647
|
+
X[feature] = X[feature].astype("category").cat.codes
|
|
650
648
|
if not is_numeric_dtype(y):
|
|
651
649
|
y = correct_string_target(y)
|
|
652
|
-
return
|
|
650
|
+
return X, y, params
|
|
653
651
|
|
|
654
652
|
|
|
655
653
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
@@ -659,20 +657,20 @@ def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
|
659
657
|
spec = inspect.getfullargspec(scoring)
|
|
660
658
|
if len(spec.args) < 3:
|
|
661
659
|
raise ValidationError(
|
|
662
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator,
|
|
660
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
|
|
663
661
|
)
|
|
664
662
|
|
|
665
663
|
|
|
666
664
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
667
665
|
metric_name = scoring
|
|
668
666
|
multiplier = 1
|
|
669
|
-
if
|
|
667
|
+
if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
|
|
670
668
|
scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
|
|
671
669
|
multiplier = -1
|
|
672
|
-
elif "root_mean_squared_log_error" in metric_name or
|
|
670
|
+
elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
|
|
673
671
|
scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
|
|
674
672
|
multiplier = -1
|
|
675
|
-
elif
|
|
673
|
+
elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
|
|
676
674
|
scoring = get_scorer("neg_root_mean_squared_error")
|
|
677
675
|
multiplier = -1
|
|
678
676
|
elif scoring in available_scorers:
|
|
@@ -724,12 +722,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
724
722
|
|
|
725
723
|
|
|
726
724
|
def _get_cat_features(
|
|
727
|
-
|
|
725
|
+
X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
|
|
728
726
|
) -> List[str]:
|
|
729
727
|
text_features = text_features or []
|
|
730
728
|
emb_features = emb_features or []
|
|
731
729
|
exclude_features = text_features + emb_features
|
|
732
|
-
return [c for c in
|
|
730
|
+
return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
|
|
733
731
|
|
|
734
732
|
|
|
735
733
|
def _get_add_params(input_params, add_params):
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype,
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
|
|
|
17
17
|
__email__ = "felix.zenk@web.de"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class _Parser:
|
|
20
|
+
class _Parser(object):
|
|
21
21
|
"""
|
|
22
22
|
A parser for the .properties file format.
|
|
23
23
|
"""
|
|
@@ -49,7 +49,7 @@ class _Parser:
|
|
|
49
49
|
return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
|
|
50
50
|
|
|
51
51
|
# I/O read
|
|
52
|
-
with open(file_path, encoding="utf-8") as f:
|
|
52
|
+
with open(file_path, mode="r", encoding="utf-8") as f:
|
|
53
53
|
lines = f.readlines()
|
|
54
54
|
|
|
55
55
|
# parse
|
|
@@ -83,7 +83,7 @@ class _Parser:
|
|
|
83
83
|
return mapping
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
class ResourceBundle:
|
|
86
|
+
class ResourceBundle(object):
|
|
87
87
|
"""
|
|
88
88
|
A ResourceBundle manages internationalization of string resources
|
|
89
89
|
"""
|
|
@@ -199,7 +199,7 @@ class ResourceBundle:
|
|
|
199
199
|
raise NotInResourceBundleError(self.name, item)
|
|
200
200
|
|
|
201
201
|
|
|
202
|
-
def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
|
|
202
|
+
def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
|
|
203
203
|
"""
|
|
204
204
|
Return a new :class:`ResourceBundle` after parsing the locale
|
|
205
205
|
|
|
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
|
|
|
224
224
|
custom_bundles = dict()
|
|
225
225
|
|
|
226
226
|
|
|
227
|
-
def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
|
|
227
|
+
def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
|
|
228
228
|
global custom_bundles
|
|
229
229
|
if custom_cfg is not None:
|
|
230
230
|
custom_bundle = custom_bundles.get(custom_cfg)
|
upgini/sampler/base.py
CHANGED
|
@@ -9,11 +9,13 @@ from abc import ABCMeta, abstractmethod
|
|
|
9
9
|
from typing import List, Optional
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
|
+
|
|
12
13
|
from sklearn.base import BaseEstimator
|
|
13
14
|
from sklearn.preprocessing import label_binarize
|
|
14
15
|
from sklearn.utils.multiclass import check_classification_targets
|
|
15
16
|
|
|
16
|
-
from .utils import
|
|
17
|
+
from .utils import check_sampling_strategy, check_target_type
|
|
18
|
+
from .utils import ArraysTransformer
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
@@ -105,6 +107,7 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
|
105
107
|
The corresponding label of `X_resampled`.
|
|
106
108
|
|
|
107
109
|
"""
|
|
110
|
+
pass
|
|
108
111
|
|
|
109
112
|
@abstractmethod
|
|
110
113
|
def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
|
|
@@ -5,10 +5,13 @@
|
|
|
5
5
|
# License: MIT
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
from sklearn.utils import check_random_state
|
|
10
|
+
from sklearn.utils import _safe_indexing
|
|
9
11
|
|
|
10
12
|
from .base import BaseUnderSampler
|
|
11
|
-
from .utils import
|
|
13
|
+
from .utils import check_target_type
|
|
14
|
+
from .utils import _deprecate_positional_args
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
class RandomUnderSampler(BaseUnderSampler):
|
upgini/search_task.py
CHANGED
|
@@ -8,10 +8,10 @@ import pandas as pd
|
|
|
8
8
|
|
|
9
9
|
from upgini import dataset
|
|
10
10
|
from upgini.http import (
|
|
11
|
+
_RestClient,
|
|
11
12
|
ProviderTaskSummary,
|
|
12
13
|
SearchProgress,
|
|
13
14
|
SearchTaskSummary,
|
|
14
|
-
_RestClient,
|
|
15
15
|
get_rest_client,
|
|
16
16
|
is_demo_api_key,
|
|
17
17
|
)
|
|
@@ -295,7 +295,7 @@ class SearchTask:
|
|
|
295
295
|
return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
|
|
296
296
|
|
|
297
297
|
|
|
298
|
-
@lru_cache
|
|
298
|
+
@lru_cache()
|
|
299
299
|
def _get_all_initial_raw_features_cached(
|
|
300
300
|
endpoint: Optional[str],
|
|
301
301
|
api_key: Optional[str],
|
|
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
|
|
|
328
328
|
return result_df
|
|
329
329
|
|
|
330
330
|
|
|
331
|
-
@lru_cache
|
|
331
|
+
@lru_cache()
|
|
332
332
|
def _get_all_validation_raw_features_cached(
|
|
333
333
|
endpoint: Optional[str],
|
|
334
334
|
api_key: Optional[str],
|
|
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
|
|
|
357
357
|
return result_df
|
|
358
358
|
|
|
359
359
|
|
|
360
|
-
@lru_cache
|
|
360
|
+
@lru_cache()
|
|
361
361
|
def _get_target_outliers_cached(
|
|
362
362
|
endpoint: Optional[str],
|
|
363
363
|
api_key: Optional[str],
|
upgini/spinner.py
CHANGED
upgini/utils/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import
|
|
5
|
+
from pandas.api.types import is_string_dtype, is_object_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -5,10 +5,10 @@ import pandas as pd
|
|
|
5
5
|
|
|
6
6
|
class BaseSearchKeyDetector:
|
|
7
7
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
8
|
-
raise NotImplementedError
|
|
8
|
+
raise NotImplementedError()
|
|
9
9
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
|
-
raise NotImplementedError
|
|
11
|
+
raise NotImplementedError()
|
|
12
12
|
|
|
13
13
|
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
14
|
for column_name in column_names:
|
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
import numbers
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
|
-
|
|
2
|
+
import numbers
|
|
5
3
|
from sklearn.utils import indexable
|
|
6
4
|
from sklearn.utils.validation import _num_samples
|
|
7
|
-
|
|
5
|
+
from sklearn.model_selection import BaseCrossValidator
|
|
8
6
|
from upgini.resource_bundle import bundle
|
|
9
7
|
|
|
10
8
|
|
upgini/utils/country_utils.py
CHANGED
upgini/utils/cv_utils.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from functools import reduce
|
|
2
2
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
3
|
-
|
|
4
3
|
import numpy as np
|
|
4
|
+
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from sklearn.model_selection import BaseCrossValidator,
|
|
6
|
+
from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
|
|
7
7
|
|
|
8
8
|
from upgini.metadata import CVType
|
|
9
9
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|