upgini 1.1.274a4__py3-none-any.whl → 1.1.280.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -0
- upgini/ads.py +6 -2
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +9 -2
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +7 -6
- upgini/errors.py +1 -1
- upgini/features_enricher.py +52 -27
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +110 -97
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +1 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +2 -2
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +25 -19
- upgini/utils/email_utils.py +3 -3
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/features_validator.py +2 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +27 -15
- upgini/version_validator.py +2 -2
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/METADATA +21 -23
- upgini-1.1.280.dev0.dist-info/RECORD +62 -0
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.274a4.dist-info/RECORD +0 -63
- upgini-1.1.274a4.dist-info/top_level.txt +0 -1
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
1
4
|
import logging
|
|
2
5
|
import re
|
|
3
6
|
from copy import deepcopy
|
|
@@ -124,7 +127,7 @@ NA_REPLACEMENT = "NA"
|
|
|
124
127
|
|
|
125
128
|
SUPPORTED_CATBOOST_METRICS = {
|
|
126
129
|
s.upper(): s
|
|
127
|
-
for s in
|
|
130
|
+
for s in (
|
|
128
131
|
"Logloss",
|
|
129
132
|
"CrossEntropy",
|
|
130
133
|
"CtrFactor",
|
|
@@ -203,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
203
206
|
"MultiLogloss",
|
|
204
207
|
"MultiCrossEntropy",
|
|
205
208
|
"Combination",
|
|
206
|
-
|
|
209
|
+
)
|
|
207
210
|
}
|
|
208
211
|
|
|
209
212
|
|
|
@@ -235,71 +238,71 @@ class EstimatorWrapper:
|
|
|
235
238
|
self.text_features = text_features
|
|
236
239
|
self.logger = logger or logging.getLogger()
|
|
237
240
|
|
|
238
|
-
def fit(self,
|
|
239
|
-
|
|
241
|
+
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
|
242
|
+
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
|
240
243
|
kwargs.update(fit_params)
|
|
241
|
-
self.estimator.fit(
|
|
244
|
+
self.estimator.fit(x, y, **kwargs)
|
|
242
245
|
return self
|
|
243
246
|
|
|
244
247
|
def predict(self, **kwargs):
|
|
245
248
|
return self.estimator.predict(**kwargs)
|
|
246
249
|
|
|
247
|
-
def _prepare_to_fit(self,
|
|
248
|
-
|
|
249
|
-
return
|
|
250
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
251
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
|
252
|
+
return x, y, groups, {}
|
|
250
253
|
|
|
251
254
|
def _prepare_data(
|
|
252
|
-
self,
|
|
255
|
+
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
253
256
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
254
|
-
for c in
|
|
255
|
-
if is_numeric_dtype(
|
|
256
|
-
|
|
257
|
+
for c in x.columns:
|
|
258
|
+
if is_numeric_dtype(x[c]):
|
|
259
|
+
x[c] = x[c].astype(float)
|
|
257
260
|
else:
|
|
258
|
-
|
|
261
|
+
x[c] = x[c].astype(str)
|
|
259
262
|
|
|
260
263
|
if not isinstance(y, pd.Series):
|
|
261
264
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
|
262
265
|
|
|
263
266
|
if groups is not None:
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
groups =
|
|
268
|
-
|
|
267
|
+
x = x.copy()
|
|
268
|
+
x["__groups"] = groups
|
|
269
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
270
|
+
groups = x["__groups"]
|
|
271
|
+
x = x.drop(columns="__groups")
|
|
269
272
|
else:
|
|
270
|
-
|
|
273
|
+
x, y = self._remove_empty_target_rows(x, y)
|
|
271
274
|
|
|
272
|
-
return
|
|
275
|
+
return x, y, groups
|
|
273
276
|
|
|
274
|
-
def _remove_empty_target_rows(self,
|
|
275
|
-
joined = pd.concat([
|
|
277
|
+
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
278
|
+
joined = pd.concat([x, y], axis=1)
|
|
276
279
|
joined = joined[joined[y.name].notna()]
|
|
277
280
|
joined = joined.reset_index(drop=True)
|
|
278
|
-
|
|
281
|
+
x = joined.drop(columns=y.name)
|
|
279
282
|
y = np.array(list(joined[y.name].values))
|
|
280
283
|
|
|
281
|
-
return
|
|
284
|
+
return x, y
|
|
282
285
|
|
|
283
|
-
def _prepare_to_calculate(self,
|
|
284
|
-
|
|
285
|
-
return
|
|
286
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
287
|
+
x, y, _ = self._prepare_data(x, y)
|
|
288
|
+
return x, y, {}
|
|
286
289
|
|
|
287
290
|
def cross_val_predict(
|
|
288
|
-
self,
|
|
291
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
289
292
|
) -> Optional[float]:
|
|
290
|
-
|
|
293
|
+
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
291
294
|
|
|
292
|
-
if
|
|
295
|
+
if x.shape[1] == 0:
|
|
293
296
|
return None
|
|
294
297
|
|
|
295
298
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
296
299
|
|
|
297
300
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
298
|
-
metric = roc_auc_score(y,
|
|
301
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
299
302
|
else:
|
|
300
303
|
cv_results = cross_validate(
|
|
301
304
|
estimator=self.estimator,
|
|
302
|
-
|
|
305
|
+
x=x,
|
|
303
306
|
y=y,
|
|
304
307
|
scoring=scorer,
|
|
305
308
|
cv=self.cv,
|
|
@@ -319,14 +322,14 @@ class EstimatorWrapper:
|
|
|
319
322
|
metric = 2 * metric - 1
|
|
320
323
|
return metric
|
|
321
324
|
|
|
322
|
-
def calculate_metric(self,
|
|
323
|
-
|
|
325
|
+
def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
|
|
326
|
+
x, y, _ = self._prepare_to_calculate(x, y)
|
|
324
327
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
325
|
-
metric = roc_auc_score(y,
|
|
328
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
326
329
|
else:
|
|
327
330
|
metrics = []
|
|
328
331
|
for est in self.cv_estimators:
|
|
329
|
-
metrics.append(self.scorer(est,
|
|
332
|
+
metrics.append(self.scorer(est, x, y))
|
|
330
333
|
|
|
331
334
|
metric = np.mean(metrics) * self.multiplier
|
|
332
335
|
return self.post_process_metric(metric)
|
|
@@ -337,13 +340,13 @@ class EstimatorWrapper:
|
|
|
337
340
|
logger: logging.Logger,
|
|
338
341
|
target_type: ModelTaskType,
|
|
339
342
|
cv: BaseCrossValidator,
|
|
340
|
-
|
|
343
|
+
x: pd.DataFrame,
|
|
341
344
|
scoring: Union[Callable, str, None] = None,
|
|
342
345
|
cat_features: Optional[List[str]] = None,
|
|
343
346
|
text_features: Optional[List[str]] = None,
|
|
344
347
|
add_params: Optional[Dict[str, Any]] = None,
|
|
345
348
|
groups: Optional[List[str]] = None,
|
|
346
|
-
) ->
|
|
349
|
+
) -> EstimatorWrapper:
|
|
347
350
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
348
351
|
kwargs = {
|
|
349
352
|
"scorer": scorer,
|
|
@@ -379,15 +382,20 @@ class EstimatorWrapper:
|
|
|
379
382
|
else:
|
|
380
383
|
estimator_copy = deepcopy(estimator)
|
|
381
384
|
kwargs["estimator"] = estimator_copy
|
|
382
|
-
if isinstance(estimator, CatBoostClassifier
|
|
385
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
383
386
|
if cat_features is not None:
|
|
387
|
+
for cat_feature in cat_features:
|
|
388
|
+
if cat_feature not in x.columns:
|
|
389
|
+
logger.error(
|
|
390
|
+
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
391
|
+
)
|
|
384
392
|
estimator_copy.set_params(
|
|
385
|
-
cat_features=[
|
|
393
|
+
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
386
394
|
)
|
|
387
395
|
estimator = CatBoostWrapper(**kwargs)
|
|
388
396
|
else:
|
|
389
397
|
try:
|
|
390
|
-
if isinstance(estimator, LGBMClassifier
|
|
398
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
391
399
|
estimator = LightGBMWrapper(**kwargs)
|
|
392
400
|
else:
|
|
393
401
|
logger.warning(
|
|
@@ -433,20 +441,20 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
433
441
|
self.emb_features = None
|
|
434
442
|
self.exclude_features = []
|
|
435
443
|
|
|
436
|
-
def _prepare_to_fit(self,
|
|
437
|
-
|
|
444
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
445
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
438
446
|
|
|
439
447
|
# Find embeddings
|
|
440
448
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
441
449
|
emb_pattern = r"(.+)_emb\d+"
|
|
442
|
-
self.emb_features = [c for c in
|
|
450
|
+
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
443
451
|
embedding_features = []
|
|
444
452
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
445
453
|
self.logger.info(
|
|
446
454
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
447
455
|
f"{self.emb_features}"
|
|
448
456
|
)
|
|
449
|
-
|
|
457
|
+
x, embedding_features = self.group_embeddings(x)
|
|
450
458
|
params["embedding_features"] = embedding_features
|
|
451
459
|
else:
|
|
452
460
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
@@ -458,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
458
466
|
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
|
459
467
|
if self.text_features is not None:
|
|
460
468
|
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
461
|
-
self.text_features = [f for f in self.text_features if f in
|
|
469
|
+
self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
|
|
462
470
|
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
463
471
|
params["text_features"] = self.text_features
|
|
464
472
|
else:
|
|
@@ -466,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
466
474
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
467
475
|
|
|
468
476
|
# Find rest categorical features
|
|
469
|
-
self.cat_features = _get_cat_features(
|
|
470
|
-
|
|
477
|
+
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
478
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
471
479
|
unique_cat_features = []
|
|
472
480
|
for name in self.cat_features:
|
|
473
481
|
# Remove constant categorical features
|
|
474
|
-
if
|
|
482
|
+
if x[name].nunique() > 1:
|
|
475
483
|
unique_cat_features.append(name)
|
|
476
484
|
else:
|
|
477
|
-
|
|
485
|
+
x = x.drop(columns=name)
|
|
478
486
|
self.cat_features = unique_cat_features
|
|
479
487
|
if (
|
|
480
488
|
hasattr(self.estimator, "get_param")
|
|
@@ -483,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
483
491
|
):
|
|
484
492
|
estimator_cat_features = self.estimator.get_param("cat_features")
|
|
485
493
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
486
|
-
cat_features_idx = {
|
|
494
|
+
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
|
487
495
|
cat_features_idx.update(estimator_cat_features)
|
|
488
|
-
self.cat_features = [
|
|
496
|
+
self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
|
|
489
497
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
490
498
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
491
499
|
else:
|
|
@@ -496,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
496
504
|
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
497
505
|
params["cat_features"] = self.cat_features
|
|
498
506
|
|
|
499
|
-
return
|
|
507
|
+
return x, y, groups, params
|
|
500
508
|
|
|
501
509
|
def group_embeddings(self, df: pd.DataFrame):
|
|
502
510
|
emb_name = "__grouped_embeddings"
|
|
@@ -507,38 +515,38 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
507
515
|
|
|
508
516
|
return df, [emb_name]
|
|
509
517
|
|
|
510
|
-
def _prepare_to_calculate(self,
|
|
518
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
511
519
|
if self.exclude_features:
|
|
512
|
-
|
|
513
|
-
|
|
520
|
+
x = x.drop(columns=self.exclude_features)
|
|
521
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
514
522
|
if self.text_features:
|
|
515
523
|
params["text_features"] = self.text_features
|
|
516
524
|
if self.emb_features:
|
|
517
|
-
|
|
525
|
+
x, emb_columns = self.group_embeddings(x)
|
|
518
526
|
params["embedding_features"] = emb_columns
|
|
519
527
|
if self.cat_features:
|
|
520
|
-
|
|
528
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
521
529
|
params["cat_features"] = self.cat_features
|
|
522
530
|
|
|
523
|
-
return
|
|
531
|
+
return x, y, params
|
|
524
532
|
|
|
525
533
|
def cross_val_predict(
|
|
526
|
-
self,
|
|
534
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
527
535
|
) -> Optional[float]:
|
|
528
536
|
try:
|
|
529
|
-
return super().cross_val_predict(
|
|
537
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
530
538
|
except Exception as e:
|
|
531
539
|
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
532
|
-
high_cardinality_features = FeaturesValidator.find_high_cardinality(
|
|
540
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
|
|
533
541
|
self.logger.warning(
|
|
534
|
-
"
|
|
542
|
+
"Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
|
|
535
543
|
f" text features {high_cardinality_features} and retry"
|
|
536
544
|
)
|
|
537
545
|
for f in high_cardinality_features:
|
|
538
546
|
self.text_features.remove(f)
|
|
539
547
|
self.exclude_features.append(f)
|
|
540
|
-
|
|
541
|
-
return super().cross_val_predict(
|
|
548
|
+
x = x.drop(columns=f)
|
|
549
|
+
return super().cross_val_predict(x, y, baseline_score_column)
|
|
542
550
|
else:
|
|
543
551
|
raise e
|
|
544
552
|
|
|
@@ -569,26 +577,26 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
569
577
|
)
|
|
570
578
|
self.cat_features = None
|
|
571
579
|
|
|
572
|
-
def _prepare_to_fit(self,
|
|
573
|
-
|
|
574
|
-
self.cat_features = _get_cat_features(
|
|
575
|
-
|
|
580
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
581
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
582
|
+
self.cat_features = _get_cat_features(x)
|
|
583
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
576
584
|
for feature in self.cat_features:
|
|
577
|
-
|
|
585
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
578
586
|
if not is_numeric_dtype(y):
|
|
579
587
|
y = correct_string_target(y)
|
|
580
588
|
|
|
581
|
-
return
|
|
589
|
+
return x, y, groups, params
|
|
582
590
|
|
|
583
|
-
def _prepare_to_calculate(self,
|
|
584
|
-
|
|
591
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
592
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
585
593
|
if self.cat_features is not None:
|
|
586
|
-
|
|
594
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
587
595
|
for feature in self.cat_features:
|
|
588
|
-
|
|
596
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
589
597
|
if not is_numeric_dtype(y):
|
|
590
598
|
y = correct_string_target(y)
|
|
591
|
-
return
|
|
599
|
+
return x, y, params
|
|
592
600
|
|
|
593
601
|
|
|
594
602
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
@@ -617,49 +625,54 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
617
625
|
)
|
|
618
626
|
self.cat_features = None
|
|
619
627
|
|
|
620
|
-
def _prepare_to_fit(self,
|
|
621
|
-
|
|
622
|
-
self.cat_features = _get_cat_features(
|
|
623
|
-
num_features = [col for col in
|
|
624
|
-
|
|
625
|
-
|
|
628
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
629
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
630
|
+
self.cat_features = _get_cat_features(x)
|
|
631
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
632
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
633
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
626
634
|
# TODO use one-hot encoding if cardinality is less 50
|
|
627
635
|
for feature in self.cat_features:
|
|
628
|
-
|
|
636
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
629
637
|
if not is_numeric_dtype(y):
|
|
630
638
|
y = correct_string_target(y)
|
|
631
|
-
return
|
|
639
|
+
return x, y, groups, params
|
|
632
640
|
|
|
633
|
-
def _prepare_to_calculate(self,
|
|
634
|
-
|
|
641
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
642
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
635
643
|
if self.cat_features is not None:
|
|
636
|
-
num_features = [col for col in
|
|
637
|
-
|
|
638
|
-
|
|
644
|
+
num_features = [col for col in x.columns if col not in self.cat_features]
|
|
645
|
+
x[num_features] = x[num_features].fillna(-999)
|
|
646
|
+
x = fill_na_cat_features(x, self.cat_features)
|
|
639
647
|
# TODO use one-hot encoding if cardinality is less 50
|
|
640
648
|
for feature in self.cat_features:
|
|
641
|
-
|
|
649
|
+
x[feature] = x[feature].astype("category").cat.codes
|
|
642
650
|
if not is_numeric_dtype(y):
|
|
643
651
|
y = correct_string_target(y)
|
|
644
|
-
return
|
|
652
|
+
return x, y, params
|
|
645
653
|
|
|
646
654
|
|
|
647
655
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
648
|
-
# TODO validate that if it is Callable then it accepts 3 arguments
|
|
649
656
|
if isinstance(scoring, str) and scoring is not None:
|
|
650
657
|
_get_scorer_by_name(scoring)
|
|
658
|
+
elif isinstance(scoring, Callable):
|
|
659
|
+
spec = inspect.getfullargspec(scoring)
|
|
660
|
+
if len(spec.args) < 3:
|
|
661
|
+
raise ValidationError(
|
|
662
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
|
|
663
|
+
)
|
|
651
664
|
|
|
652
665
|
|
|
653
666
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
654
667
|
metric_name = scoring
|
|
655
668
|
multiplier = 1
|
|
656
|
-
if "mean_squared_log_error"
|
|
669
|
+
if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
|
|
657
670
|
scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
|
|
658
671
|
multiplier = -1
|
|
659
|
-
elif "root_mean_squared_log_error" in metric_name or "RMSLE"
|
|
672
|
+
elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
|
|
660
673
|
scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
|
|
661
674
|
multiplier = -1
|
|
662
|
-
elif "root_mean_squared_error"
|
|
675
|
+
elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
|
|
663
676
|
scoring = get_scorer("neg_root_mean_squared_error")
|
|
664
677
|
multiplier = -1
|
|
665
678
|
elif scoring in available_scorers:
|
|
@@ -711,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
711
724
|
|
|
712
725
|
|
|
713
726
|
def _get_cat_features(
|
|
714
|
-
|
|
727
|
+
x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
|
|
715
728
|
) -> List[str]:
|
|
716
729
|
text_features = text_features or []
|
|
717
730
|
emb_features = emb_features or []
|
|
718
731
|
exclude_features = text_features + emb_features
|
|
719
|
-
return [c for c in
|
|
732
|
+
return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
|
|
720
733
|
|
|
721
734
|
|
|
722
735
|
def _get_add_params(input_params, add_params):
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype,
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
|
|
|
17
17
|
__email__ = "felix.zenk@web.de"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class _Parser
|
|
20
|
+
class _Parser:
|
|
21
21
|
"""
|
|
22
22
|
A parser for the .properties file format.
|
|
23
23
|
"""
|
|
@@ -49,7 +49,7 @@ class _Parser(object):
|
|
|
49
49
|
return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
|
|
50
50
|
|
|
51
51
|
# I/O read
|
|
52
|
-
with open(file_path,
|
|
52
|
+
with open(file_path, encoding="utf-8") as f:
|
|
53
53
|
lines = f.readlines()
|
|
54
54
|
|
|
55
55
|
# parse
|
|
@@ -83,7 +83,7 @@ class _Parser(object):
|
|
|
83
83
|
return mapping
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
class ResourceBundle
|
|
86
|
+
class ResourceBundle:
|
|
87
87
|
"""
|
|
88
88
|
A ResourceBundle manages internationalization of string resources
|
|
89
89
|
"""
|
|
@@ -199,7 +199,7 @@ class ResourceBundle(object):
|
|
|
199
199
|
raise NotInResourceBundleError(self.name, item)
|
|
200
200
|
|
|
201
201
|
|
|
202
|
-
def get_bundle(bundle_name: str, locale: str | Sequence[str
|
|
202
|
+
def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
|
|
203
203
|
"""
|
|
204
204
|
Return a new :class:`ResourceBundle` after parsing the locale
|
|
205
205
|
|
|
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
|
|
|
224
224
|
custom_bundles = dict()
|
|
225
225
|
|
|
226
226
|
|
|
227
|
-
def get_custom_bundle(custom_cfg: Optional[str] = None) ->
|
|
227
|
+
def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
|
|
228
228
|
global custom_bundles
|
|
229
229
|
if custom_cfg is not None:
|
|
230
230
|
custom_bundle = custom_bundles.get(custom_cfg)
|
|
@@ -159,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
159
159
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
160
160
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
161
161
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
162
|
-
dataset_rarest_class_less_min=
|
|
162
|
+
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
163
163
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
164
164
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
165
165
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
upgini/sampler/base.py
CHANGED
|
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
|
|
|
9
9
|
from typing import List, Optional
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
|
-
|
|
13
12
|
from sklearn.base import BaseEstimator
|
|
14
13
|
from sklearn.preprocessing import label_binarize
|
|
15
14
|
from sklearn.utils.multiclass import check_classification_targets
|
|
16
15
|
|
|
17
|
-
from .utils import check_sampling_strategy, check_target_type
|
|
18
|
-
from .utils import ArraysTransformer
|
|
16
|
+
from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
|
|
|
107
105
|
The corresponding label of `X_resampled`.
|
|
108
106
|
|
|
109
107
|
"""
|
|
110
|
-
pass
|
|
111
108
|
|
|
112
109
|
@abstractmethod
|
|
113
110
|
def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
|
|
@@ -5,13 +5,10 @@
|
|
|
5
5
|
# License: MIT
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
|
|
9
|
-
from sklearn.utils import check_random_state
|
|
10
|
-
from sklearn.utils import _safe_indexing
|
|
8
|
+
from sklearn.utils import _safe_indexing, check_random_state
|
|
11
9
|
|
|
12
10
|
from .base import BaseUnderSampler
|
|
13
|
-
from .utils import check_target_type
|
|
14
|
-
from .utils import _deprecate_positional_args
|
|
11
|
+
from .utils import _deprecate_positional_args, check_target_type
|
|
15
12
|
|
|
16
13
|
|
|
17
14
|
class RandomUnderSampler(BaseUnderSampler):
|
upgini/search_task.py
CHANGED
|
@@ -8,10 +8,10 @@ import pandas as pd
|
|
|
8
8
|
|
|
9
9
|
from upgini import dataset
|
|
10
10
|
from upgini.http import (
|
|
11
|
-
_RestClient,
|
|
12
11
|
ProviderTaskSummary,
|
|
13
12
|
SearchProgress,
|
|
14
13
|
SearchTaskSummary,
|
|
14
|
+
_RestClient,
|
|
15
15
|
get_rest_client,
|
|
16
16
|
is_demo_api_key,
|
|
17
17
|
)
|
|
@@ -295,7 +295,7 @@ class SearchTask:
|
|
|
295
295
|
return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
|
|
296
296
|
|
|
297
297
|
|
|
298
|
-
@lru_cache
|
|
298
|
+
@lru_cache
|
|
299
299
|
def _get_all_initial_raw_features_cached(
|
|
300
300
|
endpoint: Optional[str],
|
|
301
301
|
api_key: Optional[str],
|
|
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
|
|
|
328
328
|
return result_df
|
|
329
329
|
|
|
330
330
|
|
|
331
|
-
@lru_cache
|
|
331
|
+
@lru_cache
|
|
332
332
|
def _get_all_validation_raw_features_cached(
|
|
333
333
|
endpoint: Optional[str],
|
|
334
334
|
api_key: Optional[str],
|
|
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
|
|
|
357
357
|
return result_df
|
|
358
358
|
|
|
359
359
|
|
|
360
|
-
@lru_cache
|
|
360
|
+
@lru_cache
|
|
361
361
|
def _get_target_outliers_cached(
|
|
362
362
|
endpoint: Optional[str],
|
|
363
363
|
api_key: Optional[str],
|
upgini/spinner.py
CHANGED
upgini/utils/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import is_string_dtype
|
|
5
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -20,5 +20,6 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
20
20
|
return [
|
|
21
21
|
col
|
|
22
22
|
for col in tmp.columns
|
|
23
|
-
if is_string_dtype(tmp[col])
|
|
23
|
+
if (is_string_dtype(tmp[col]) or is_object_dtype(tmp[col]))
|
|
24
|
+
and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
24
25
|
]
|
|
@@ -5,10 +5,10 @@ import pandas as pd
|
|
|
5
5
|
|
|
6
6
|
class BaseSearchKeyDetector:
|
|
7
7
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
8
|
-
raise NotImplementedError
|
|
8
|
+
raise NotImplementedError
|
|
9
9
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
|
-
raise NotImplementedError
|
|
11
|
+
raise NotImplementedError
|
|
12
12
|
|
|
13
13
|
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
14
|
for column_name in column_names:
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import numpy as np
|
|
2
1
|
import numbers
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.model_selection import BaseCrossValidator
|
|
3
5
|
from sklearn.utils import indexable
|
|
4
6
|
from sklearn.utils.validation import _num_samples
|
|
5
|
-
|
|
7
|
+
|
|
6
8
|
from upgini.resource_bundle import bundle
|
|
7
9
|
|
|
8
10
|
|
upgini/utils/country_utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from pandas.api.types import is_string_dtype
|
|
2
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
3
3
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
|
9
9
|
return "country" in str(column_name).lower()
|
|
10
10
|
|
|
11
11
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
12
|
-
if not is_string_dtype(column):
|
|
12
|
+
if not is_string_dtype(column) and not is_object_dtype(column):
|
|
13
13
|
return False
|
|
14
14
|
|
|
15
15
|
all_count = len(column)
|