upgini 1.1.279a2__py3-none-any.whl → 1.1.279a2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -0
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +2 -1
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +4 -4
- upgini/errors.py +1 -1
- upgini/features_enricher.py +4 -4
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +101 -99
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +1 -1
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +1 -1
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +9 -3
- upgini/utils/email_utils.py +2 -2
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/track_info.py +2 -2
- upgini/version_validator.py +2 -2
- {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info}/METADATA +21 -23
- upgini-1.1.279a2.dev1.dist-info/RECORD +62 -0
- {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.279a2.dist-info/RECORD +0 -63
- upgini-1.1.279a2.dist-info/top_level.txt +0 -1
- {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import inspect
|
|
2
4
|
import logging
|
|
3
5
|
import re
|
|
@@ -125,7 +127,7 @@ NA_REPLACEMENT = "NA"
|
|
|
125
127
|
|
|
126
128
|
SUPPORTED_CATBOOST_METRICS = {
|
|
127
129
|
s.upper(): s
|
|
128
|
-
for s in
|
|
130
|
+
for s in (
|
|
129
131
|
"Logloss",
|
|
130
132
|
"CrossEntropy",
|
|
131
133
|
"CtrFactor",
|
|
@@ -204,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
204
206
|
"MultiLogloss",
|
|
205
207
|
"MultiCrossEntropy",
|
|
206
208
|
"Combination",
|
|
207
|
-
|
|
209
|
+
)
|
|
208
210
|
}
|
|
209
211
|
|
|
210
212
|
|
|
@@ -236,71 +238,71 @@ class EstimatorWrapper:
|
|
|
236
238
|
self.text_features = text_features
|
|
237
239
|
self.logger = logger or logging.getLogger()
|
|
238
240
|
|
|
239
|
-
def fit(self,
|
|
240
|
-
|
|
241
|
+
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
|
242
|
+
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
|
241
243
|
kwargs.update(fit_params)
|
|
242
|
-
self.estimator.fit(
|
|
244
|
+
self.estimator.fit(x, y, **kwargs)
|
|
243
245
|
return self
|
|
244
246
|
|
|
245
247
|
def predict(self, **kwargs):
|
|
246
248
|
return self.estimator.predict(**kwargs)
|
|
247
249
|
|
|
248
|
-
def _prepare_to_fit(self,
|
|
249
|
-
|
|
250
|
-
return
|
|
250
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
251
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
|
252
|
+
return x, y, groups, {}
|
|
251
253
|
|
|
252
254
|
def _prepare_data(
|
|
253
|
-
self,
|
|
255
|
+
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
254
256
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
255
|
-
for c in
|
|
256
|
-
if is_numeric_dtype(
|
|
257
|
-
|
|
257
|
+
for c in x.columns:
|
|
258
|
+
if is_numeric_dtype(x[c]):
|
|
259
|
+
x[c] = x[c].astype(float)
|
|
258
260
|
else:
|
|
259
|
-
|
|
261
|
+
x[c] = x[c].astype(str)
|
|
260
262
|
|
|
261
263
|
if not isinstance(y, pd.Series):
|
|
262
264
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
|
263
265
|
|
|
264
266
|
if groups is not None:
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
groups =
|
|
269
|
-
|
|
267
|
+
x = x.copy()
|
|
268
|
+
x["__groups"] = groups
|
|
269
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
270
|
+
groups = x["__groups"]
|
|
271
|
+
x = x.drop(columns="__groups")
|
|
270
272
|
else:
|
|
271
|
-
|
|
273
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
272
274
|
|
|
273
|
-
return
|
|
275
|
+
return x, y, groups
|
|
274
276
|
|
|
275
|
-
def _remove_empty_target_rows(self,
|
|
276
|
-
joined = pd.concat([
|
|
277
|
+
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
278
|
+
joined = pd.concat([x, y], axis=1)
|
|
277
279
|
joined = joined[joined[y.name].notna()]
|
|
278
280
|
joined = joined.reset_index(drop=True)
|
|
279
|
-
|
|
281
|
+
x = joined.drop(columns=y.name)
|
|
280
282
|
y = np.array(list(joined[y.name].values))
|
|
281
283
|
|
|
282
|
-
return
|
|
284
|
+
return x, y
|
|
283
285
|
|
|
284
|
-
def _prepare_to_calculate(self,
|
|
285
|
-
|
|
286
|
-
return
|
|
286
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
287
|
+
x, y, _ = self._prepare_data(x, y)
|
|
288
|
+
return x, y, {}
|
|
287
289
|
|
|
288
290
|
def cross_val_predict(
|
|
289
|
-
self,
|
|
291
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
290
292
|
) -> Optional[float]:
|
|
291
|
-
|
|
293
|
+
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
292
294
|
|
|
293
|
-
if
|
|
295
|
+
if x.shape[1] == 0:
|
|
294
296
|
return None
|
|
295
297
|
|
|
296
298
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
297
299
|
|
|
298
300
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
299
|
-
metric = roc_auc_score(y,
|
|
301
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
300
302
|
else:
|
|
301
303
|
cv_results = cross_validate(
|
|
302
304
|
estimator=self.estimator,
|
|
303
|
-
|
|
305
|
+
x=x,
|
|
304
306
|
y=y,
|
|
305
307
|
scoring=scorer,
|
|
306
308
|
cv=self.cv,
|
|
@@ -320,14 +322,14 @@ class EstimatorWrapper:
|
|
|
320
322
|
metric = 2 * metric - 1
|
|
321
323
|
return metric
|
|
322
324
|
|
|
323
|
-
def calculate_metric(self,
|
|
324
|
-
|
|
325
|
+
def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
|
|
326
|
+
x, y, _ = self._prepare_to_calculate(x, y)
|
|
325
327
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
326
|
-
metric = roc_auc_score(y,
|
|
328
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
327
329
|
else:
|
|
328
330
|
metrics = []
|
|
329
331
|
for est in self.cv_estimators:
|
|
330
|
-
metrics.append(self.scorer(est,
|
|
332
|
+
metrics.append(self.scorer(est, x, y))
|
|
331
333
|
|
|
332
334
|
metric = np.mean(metrics) * self.multiplier
|
|
333
335
|
return self.post_process_metric(metric)
|
|
@@ -338,13 +340,13 @@ class EstimatorWrapper:
|
|
|
338
340
|
logger: logging.Logger,
|
|
339
341
|
target_type: ModelTaskType,
|
|
340
342
|
cv: BaseCrossValidator,
|
|
341
|
-
|
|
343
|
+
x: pd.DataFrame,
|
|
342
344
|
scoring: Union[Callable, str, None] = None,
|
|
343
345
|
cat_features: Optional[List[str]] = None,
|
|
344
346
|
text_features: Optional[List[str]] = None,
|
|
345
347
|
add_params: Optional[Dict[str, Any]] = None,
|
|
346
348
|
groups: Optional[List[str]] = None,
|
|
347
|
-
) ->
|
|
349
|
+
) -> EstimatorWrapper:
|
|
348
350
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
349
351
|
kwargs = {
|
|
350
352
|
"scorer": scorer,
|
|
@@ -380,20 +382,20 @@ class EstimatorWrapper:
|
|
|
380
382
|
else:
|
|
381
383
|
estimator_copy = deepcopy(estimator)
|
|
382
384
|
kwargs["estimator"] = estimator_copy
|
|
383
|
-
if isinstance(estimator, CatBoostClassifier
|
|
385
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
384
386
|
if cat_features is not None:
|
|
385
387
|
for cat_feature in cat_features:
|
|
386
|
-
if cat_feature not in
|
|
388
|
+
if cat_feature not in x.columns:
|
|
387
389
|
logger.error(
|
|
388
|
-
f"Client cat_feature `{cat_feature}` not found in
|
|
390
|
+
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
389
391
|
)
|
|
390
392
|
estimator_copy.set_params(
|
|
391
|
-
cat_features=[
|
|
393
|
+
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
392
394
|
)
|
|
393
395
|
estimator = CatBoostWrapper(**kwargs)
|
|
394
396
|
else:
|
|
395
397
|
try:
|
|
396
|
-
if isinstance(estimator, LGBMClassifier
|
|
398
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
397
399
|
estimator = LightGBMWrapper(**kwargs)
|
|
398
400
|
else:
|
|
399
401
|
logger.warning(
|
|
@@ -439,20 +441,20 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
439
441
|
self.emb_features = None
|
|
440
442
|
self.exclude_features = []
|
|
441
443
|
|
|
442
|
-
def _prepare_to_fit(self,
|
|
443
|
-
|
|
444
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
445
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
444
446
|
|
|
445
447
|
# Find embeddings
|
|
446
448
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
447
449
|
emb_pattern = r"(.+)_emb\d+"
|
|
448
|
-
self.emb_features = [c for c in
|
|
450
|
+
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
449
451
|
embedding_features = []
|
|
450
452
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
451
453
|
self.logger.info(
|
|
452
454
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
453
455
|
f"{self.emb_features}"
|
|
454
456
|
)
|
|
455
|
-
|
|
457
|
+
x, embedding_features = self.group_embeddings(x)
|
|
456
458
|
params["embedding_features"] = embedding_features
|
|
457
459
|
else:
|
|
458
460
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
@@ -464,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
464
466
|
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
|
465
467
|
if self.text_features is not None:
|
|
466
468
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
467
|
-
self.text_features = [f for f in self.text_features if f in
|
|
469
|
+
self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
|
|
468
470
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
469
471
|
params["text_features"] = self.text_features
|
|
470
472
|
else:
|
|
@@ -472,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
472
474
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
473
475
|
|
|
474
476
|
# Find rest categorical features
|
|
475
|
-
self.cat_features = _get_cat_features(
|
|
476
|
-
|
|
477
|
+
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
478
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
477
479
|
unique_cat_features = []
|
|
478
480
|
for name in self.cat_features:
|
|
479
481
|
# Remove constant categorical features
|
|
480
|
-
if
|
|
482
|
+
if x[name].nunique() > 1:
|
|
481
483
|
unique_cat_features.append(name)
|
|
482
484
|
else:
|
|
483
|
-
|
|
485
|
+
x = x.drop(columns=name)
|
|
484
486
|
self.cat_features = unique_cat_features
|
|
485
487
|
if (
|
|
486
488
|
hasattr(self.estimator, "get_param")
|
|
@@ -489,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
489
491
|
):
|
|
490
492
|
estimator_cat_features = self.estimator.get_param("cat_features")
|
|
491
493
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
492
|
-
cat_features_idx = {
|
|
494
|
+
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
|
493
495
|
cat_features_idx.update(estimator_cat_features)
|
|
494
|
-
self.cat_features = [
|
|
496
|
+
self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
|
|
495
497
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
496
498
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
497
499
|
else:
|
|
@@ -502,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
502
504
|
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
503
505
|
params["cat_features"] = self.cat_features
|
|
504
506
|
|
|
505
|
-
return
|
|
507
|
+
return x, y, groups, params
|
|
506
508
|
|
|
507
509
|
def group_embeddings(self, df: pd.DataFrame):
|
|
508
510
|
emb_name = "__grouped_embeddings"
|
|
@@ -513,38 +515,38 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
513
515
|
|
|
514
516
|
return df, [emb_name]
|
|
515
517
|
|
|
516
|
-
def _prepare_to_calculate(self,
|
|
518
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
517
519
|
if self.exclude_features:
|
|
518
|
-
|
|
519
|
-
|
|
520
|
+
x = x.drop(columns=self.exclude_features)
|
|
521
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
520
522
|
if self.text_features:
|
|
521
523
|
params["text_features"] = self.text_features
|
|
522
524
|
if self.emb_features:
|
|
523
|
-
|
|
525
|
+
x, emb_columns = self.group_embeddings(x)
|
|
524
526
|
params["embedding_features"] = emb_columns
|
|
525
527
|
if self.cat_features:
|
|
526
|
-
|
|
528
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
527
529
|
params["cat_features"] = self.cat_features
|
|
528
530
|
|
|
529
|
-
return
|
|
531
|
+
return x, y, params
|
|
530
532
|
|
|
531
533
|
def cross_val_predict(
|
|
532
|
-
self,
|
|
534
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
533
535
|
) -> Optional[float]:
|
|
534
536
|
try:
|
|
535
|
-
return super().cross_val_predict(
|
|
537
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
536
538
|
except Exception as e:
|
|
537
539
|
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
538
|
-
high_cardinality_features = FeaturesValidator.find_high_cardinality(
|
|
540
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
|
|
539
541
|
self.logger.warning(
|
|
540
|
-
"
|
|
542
|
+
"Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
|
|
541
543
|
f" text features {high_cardinality_features} and retry"
|
|
542
544
|
)
|
|
543
545
|
for f in high_cardinality_features:
|
|
544
546
|
self.text_features.remove(f)
|
|
545
547
|
self.exclude_features.append(f)
|
|
546
|
-
|
|
547
|
-
return super().cross_val_predict(
|
|
548
|
+
x = x.drop(columns=f)
|
|
549
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
548
550
|
else:
|
|
549
551
|
raise e
|
|
550
552
|
|
|
@@ -575,26 +577,26 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
575
577
|
)
|
|
576
578
|
self.cat_features = None
|
|
577
579
|
|
|
578
|
-
def _prepare_to_fit(self,
|
|
579
|
-
|
|
580
|
-
self.cat_features = _get_cat_features(
|
|
581
|
-
|
|
580
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
581
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
582
|
+
self.cat_features = _get_cat_features(x)
|
|
583
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
582
584
|
for feature in self.cat_features:
|
|
583
|
-
|
|
585
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
584
586
|
if not is_numeric_dtype(y):
|
|
585
587
|
y = correct_string_target(y)
|
|
586
588
|
|
|
587
|
-
return
|
|
589
|
+
return x, y, groups, params
|
|
588
590
|
|
|
589
|
-
def _prepare_to_calculate(self,
|
|
590
|
-
|
|
591
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
592
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
591
593
|
if self.cat_features is not None:
|
|
592
|
-
|
|
594
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
593
595
|
for feature in self.cat_features:
|
|
594
|
-
|
|
596
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
595
597
|
if not is_numeric_dtype(y):
|
|
596
598
|
y = correct_string_target(y)
|
|
597
|
-
return
|
|
599
|
+
return x, y, params
|
|
598
600
|
|
|
599
601
|
|
|
600
602
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
@@ -623,31 +625,31 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
623
625
|
)
|
|
624
626
|
self.cat_features = None
|
|
625
627
|
|
|
626
|
-
def _prepare_to_fit(self,
|
|
627
|
-
|
|
628
|
-
self.cat_features = _get_cat_features(
|
|
629
|
-
num_features = [col for col in
|
|
630
|
-
|
|
631
|
-
|
|
628
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
629
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
630
|
+
self.cat_features = _get_cat_features(x)
|
|
631
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
632
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
633
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
632
634
|
# TODO use one-hot encoding if cardinality is less 50
|
|
633
635
|
for feature in self.cat_features:
|
|
634
|
-
|
|
636
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
635
637
|
if not is_numeric_dtype(y):
|
|
636
638
|
y = correct_string_target(y)
|
|
637
|
-
return
|
|
639
|
+
return x, y, groups, params
|
|
638
640
|
|
|
639
|
-
def _prepare_to_calculate(self,
|
|
640
|
-
|
|
641
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
642
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
641
643
|
if self.cat_features is not None:
|
|
642
|
-
num_features = [col for col in
|
|
643
|
-
|
|
644
|
-
|
|
644
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
645
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
646
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
645
647
|
# TODO use one-hot encoding if cardinality is less 50
|
|
646
648
|
for feature in self.cat_features:
|
|
647
|
-
|
|
649
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
648
650
|
if not is_numeric_dtype(y):
|
|
649
651
|
y = correct_string_target(y)
|
|
650
|
-
return
|
|
652
|
+
return x, y, params
|
|
651
653
|
|
|
652
654
|
|
|
653
655
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
@@ -657,20 +659,20 @@ def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
|
657
659
|
spec = inspect.getfullargspec(scoring)
|
|
658
660
|
if len(spec.args) < 3:
|
|
659
661
|
raise ValidationError(
|
|
660
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator,
|
|
662
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
661
663
|
)
|
|
662
664
|
|
|
663
665
|
|
|
664
666
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
665
667
|
metric_name = scoring
|
|
666
668
|
multiplier = 1
|
|
667
|
-
if "mean_squared_log_error"
|
|
669
|
+
if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
|
|
668
670
|
scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
|
|
669
671
|
multiplier = -1
|
|
670
|
-
elif "root_mean_squared_log_error" in metric_name or "RMSLE"
|
|
672
|
+
elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
|
|
671
673
|
scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
|
|
672
674
|
multiplier = -1
|
|
673
|
-
elif "root_mean_squared_error"
|
|
675
|
+
elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
|
|
674
676
|
scoring = get_scorer("neg_root_mean_squared_error")
|
|
675
677
|
multiplier = -1
|
|
676
678
|
elif scoring in available_scorers:
|
|
@@ -722,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
722
724
|
|
|
723
725
|
|
|
724
726
|
def _get_cat_features(
|
|
725
|
-
|
|
727
|
+
x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
|
|
726
728
|
) -> List[str]:
|
|
727
729
|
text_features = text_features or []
|
|
728
730
|
emb_features = emb_features or []
|
|
729
731
|
exclude_features = text_features + emb_features
|
|
730
|
-
return [c for c in
|
|
732
|
+
return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
|
|
731
733
|
|
|
732
734
|
|
|
733
735
|
def _get_add_params(input_params, add_params):
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype,
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
|
|
|
17
17
|
__email__ = "felix.zenk@web.de"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class _Parser
|
|
20
|
+
class _Parser:
|
|
21
21
|
"""
|
|
22
22
|
A parser for the .properties file format.
|
|
23
23
|
"""
|
|
@@ -49,7 +49,7 @@ class _Parser(object):
|
|
|
49
49
|
return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
|
|
50
50
|
|
|
51
51
|
# I/O read
|
|
52
|
-
with open(file_path,
|
|
52
|
+
with open(file_path, encoding="utf-8") as f:
|
|
53
53
|
lines = f.readlines()
|
|
54
54
|
|
|
55
55
|
# parse
|
|
@@ -83,7 +83,7 @@ class _Parser(object):
|
|
|
83
83
|
return mapping
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
class ResourceBundle
|
|
86
|
+
class ResourceBundle:
|
|
87
87
|
"""
|
|
88
88
|
A ResourceBundle manages internationalization of string resources
|
|
89
89
|
"""
|
|
@@ -199,7 +199,7 @@ class ResourceBundle(object):
|
|
|
199
199
|
raise NotInResourceBundleError(self.name, item)
|
|
200
200
|
|
|
201
201
|
|
|
202
|
-
def get_bundle(bundle_name: str, locale: str | Sequence[str
|
|
202
|
+
def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
|
|
203
203
|
"""
|
|
204
204
|
Return a new :class:`ResourceBundle` after parsing the locale
|
|
205
205
|
|
|
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
|
|
|
224
224
|
custom_bundles = dict()
|
|
225
225
|
|
|
226
226
|
|
|
227
|
-
def get_custom_bundle(custom_cfg: Optional[str] = None) ->
|
|
227
|
+
def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
|
|
228
228
|
global custom_bundles
|
|
229
229
|
if custom_cfg is not None:
|
|
230
230
|
custom_bundle = custom_bundles.get(custom_cfg)
|
upgini/sampler/base.py
CHANGED
|
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
|
|
|
9
9
|
from typing import List, Optional
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
|
-
|
|
13
12
|
from sklearn.base import BaseEstimator
|
|
14
13
|
from sklearn.preprocessing import label_binarize
|
|
15
14
|
from sklearn.utils.multiclass import check_classification_targets
|
|
16
15
|
|
|
17
|
-
from .utils import check_sampling_strategy, check_target_type
|
|
18
|
-
from .utils import ArraysTransformer
|
|
16
|
+
from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
|
107
105
|
The corresponding label of `X_resampled`.
|
|
108
106
|
|
|
109
107
|
"""
|
|
110
|
-
pass
|
|
111
108
|
|
|
112
109
|
@abstractmethod
|
|
113
110
|
def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
|
|
@@ -5,13 +5,10 @@
|
|
|
5
5
|
# License: MIT
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
|
|
9
|
-
from sklearn.utils import check_random_state
|
|
10
|
-
from sklearn.utils import _safe_indexing
|
|
8
|
+
from sklearn.utils import _safe_indexing, check_random_state
|
|
11
9
|
|
|
12
10
|
from .base import BaseUnderSampler
|
|
13
|
-
from .utils import check_target_type
|
|
14
|
-
from .utils import _deprecate_positional_args
|
|
11
|
+
from .utils import _deprecate_positional_args, check_target_type
|
|
15
12
|
|
|
16
13
|
|
|
17
14
|
class RandomUnderSampler(BaseUnderSampler):
|
upgini/search_task.py
CHANGED
|
@@ -8,10 +8,10 @@ import pandas as pd
|
|
|
8
8
|
|
|
9
9
|
from upgini import dataset
|
|
10
10
|
from upgini.http import (
|
|
11
|
-
_RestClient,
|
|
12
11
|
ProviderTaskSummary,
|
|
13
12
|
SearchProgress,
|
|
14
13
|
SearchTaskSummary,
|
|
14
|
+
_RestClient,
|
|
15
15
|
get_rest_client,
|
|
16
16
|
is_demo_api_key,
|
|
17
17
|
)
|
|
@@ -295,7 +295,7 @@ class SearchTask:
|
|
|
295
295
|
return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
|
|
296
296
|
|
|
297
297
|
|
|
298
|
-
@lru_cache
|
|
298
|
+
@lru_cache
|
|
299
299
|
def _get_all_initial_raw_features_cached(
|
|
300
300
|
endpoint: Optional[str],
|
|
301
301
|
api_key: Optional[str],
|
|
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
|
|
|
328
328
|
return result_df
|
|
329
329
|
|
|
330
330
|
|
|
331
|
-
@lru_cache
|
|
331
|
+
@lru_cache
|
|
332
332
|
def _get_all_validation_raw_features_cached(
|
|
333
333
|
endpoint: Optional[str],
|
|
334
334
|
api_key: Optional[str],
|
|
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
|
|
|
357
357
|
return result_df
|
|
358
358
|
|
|
359
359
|
|
|
360
|
-
@lru_cache
|
|
360
|
+
@lru_cache
|
|
361
361
|
def _get_target_outliers_cached(
|
|
362
362
|
endpoint: Optional[str],
|
|
363
363
|
api_key: Optional[str],
|
upgini/spinner.py
CHANGED
upgini/utils/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import
|
|
5
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -5,10 +5,10 @@ import pandas as pd
|
|
|
5
5
|
|
|
6
6
|
class BaseSearchKeyDetector:
|
|
7
7
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
8
|
-
raise NotImplementedError
|
|
8
|
+
raise NotImplementedError
|
|
9
9
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
|
-
raise NotImplementedError
|
|
11
|
+
raise NotImplementedError
|
|
12
12
|
|
|
13
13
|
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
14
|
for column_name in column_names:
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import numpy as np
|
|
2
1
|
import numbers
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.model_selection import BaseCrossValidator
|
|
3
5
|
from sklearn.utils import indexable
|
|
4
6
|
from sklearn.utils.validation import _num_samples
|
|
5
|
-
|
|
7
|
+
|
|
6
8
|
from upgini.resource_bundle import bundle
|
|
7
9
|
|
|
8
10
|
|
upgini/utils/country_utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from upgini.metadata import ModelTaskType, RuntimeParameters
|
|
2
|
-
from typing import Optional, Dict, Any
|
|
3
1
|
import logging
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from upgini.metadata import ModelTaskType, RuntimeParameters
|
|
4
5
|
from upgini.resource_bundle import bundle
|
|
5
6
|
|
|
6
7
|
|
upgini/utils/cv_utils.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from functools import reduce
|
|
2
2
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
3
|
-
import numpy as np
|
|
4
3
|
|
|
4
|
+
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from sklearn.model_selection import BaseCrossValidator,
|
|
6
|
+
from sklearn.model_selection import BaseCrossValidator, GroupKFold, GroupShuffleSplit, KFold, TimeSeriesSplit
|
|
7
7
|
|
|
8
8
|
from upgini.metadata import CVType
|
|
9
9
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|