upgini 1.2.81a3832.dev10__py3-none-any.whl → 1.2.81a3832.dev12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +17 -3
- upgini/metrics.py +73 -50
- upgini/search_task.py +1 -0
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev12.dist-info}/METADATA +1 -1
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev12.dist-info}/RECORD +8 -8
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev12.dist-info}/WHEEL +0 -0
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev12.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.81a3832.dev10"
|
1
|
+
__version__ = "1.2.81a3832.dev12"
|
upgini/features_enricher.py
CHANGED
@@ -1017,6 +1017,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1017
1017
|
else:
|
1018
1018
|
client_cat_features = []
|
1019
1019
|
|
1020
|
+
# rename baseline_score_column
|
1021
|
+
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
1022
|
+
baseline_score_column = self.baseline_score_column
|
1023
|
+
if baseline_score_column is not None:
|
1024
|
+
baseline_score_column = reversed_renaming[baseline_score_column]
|
1025
|
+
|
1020
1026
|
gc.collect()
|
1021
1027
|
|
1022
1028
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
@@ -1069,7 +1075,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1069
1075
|
has_date=has_date,
|
1070
1076
|
)
|
1071
1077
|
etalon_cv_result = baseline_estimator.cross_val_predict(
|
1072
|
-
fitting_X, y_sorted,
|
1078
|
+
fitting_X, y_sorted, baseline_score_column
|
1073
1079
|
)
|
1074
1080
|
etalon_metric = etalon_cv_result.get_display_metric()
|
1075
1081
|
if etalon_metric is None:
|
@@ -1165,7 +1171,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1165
1171
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
1166
1172
|
)
|
1167
1173
|
etalon_eval_results = baseline_estimator.calculate_metric(
|
1168
|
-
eval_X_sorted, eval_y_sorted,
|
1174
|
+
eval_X_sorted, eval_y_sorted, baseline_score_column
|
1169
1175
|
)
|
1170
1176
|
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
1171
1177
|
self.logger.info(
|
@@ -1959,6 +1965,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
1959
1965
|
enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
|
1960
1966
|
)
|
1961
1967
|
|
1968
|
+
# Add hash-suffixes because output of transform has original names
|
1969
|
+
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
1970
|
+
X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1971
|
+
enriched_X.rename(columns=reversed_renaming, inplace=True)
|
1972
|
+
for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
|
1973
|
+
eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1974
|
+
enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
|
1975
|
+
|
1962
1976
|
# Cache and return results
|
1963
1977
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
1964
1978
|
return self.__cache_and_return_results(
|
@@ -4245,7 +4259,7 @@ if response.status_code == 200:
|
|
4245
4259
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
4246
4260
|
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
4247
4261
|
if self.fit_columns_renaming:
|
4248
|
-
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
4262
|
+
search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
|
4249
4263
|
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
4250
4264
|
|
4251
4265
|
try:
|
upgini/metrics.py
CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
|
|
15
15
|
from category_encoders.cat_boost import CatBoostEncoder
|
16
16
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
17
17
|
from numpy import log1p
|
18
|
-
from pandas.api.types import is_numeric_dtype
|
18
|
+
from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
|
19
19
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
20
20
|
|
21
21
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
@@ -324,6 +324,9 @@ class EstimatorWrapper:
|
|
324
324
|
self.text_features = text_features
|
325
325
|
self.logger = logger or logging.getLogger()
|
326
326
|
self.droped_features = []
|
327
|
+
self.converted_to_int = []
|
328
|
+
self.converted_to_str = []
|
329
|
+
self.converted_to_numeric = []
|
327
330
|
|
328
331
|
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
329
332
|
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
@@ -335,44 +338,6 @@ class EstimatorWrapper:
|
|
335
338
|
x, _, _ = self._prepare_to_calculate(x, None)
|
336
339
|
return self.estimator.predict(x, **kwargs)
|
337
340
|
|
338
|
-
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
339
|
-
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
340
|
-
|
341
|
-
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
342
|
-
self.droped_features = []
|
343
|
-
for c in x.columns:
|
344
|
-
if _get_unique_count(x[c]) < 2:
|
345
|
-
self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
|
346
|
-
self.droped_features.append(c)
|
347
|
-
if c in self.cat_features:
|
348
|
-
self.cat_features.remove(c)
|
349
|
-
x.drop(columns=[c], inplace=True)
|
350
|
-
elif c in self.cat_features:
|
351
|
-
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
352
|
-
x[c] = x[c].astype(np.int64)
|
353
|
-
elif is_numeric_object(x[c]):
|
354
|
-
self.logger.warning(
|
355
|
-
f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
|
356
|
-
)
|
357
|
-
x[c] = pd.to_numeric(x[c], errors="coerce")
|
358
|
-
self.cat_features.remove(c)
|
359
|
-
elif x[c].dtype != "category":
|
360
|
-
x[c] = x[c].astype(str)
|
361
|
-
elif self.text_features is not None and c in self.text_features:
|
362
|
-
x[c] = x[c].astype(str)
|
363
|
-
else:
|
364
|
-
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
365
|
-
x[c] = x[c].astype(np.int64)
|
366
|
-
elif not is_valid_numeric_array_data(x[c]):
|
367
|
-
try:
|
368
|
-
x[c] = pd.to_numeric(x[c], errors="raise")
|
369
|
-
except (ValueError, TypeError):
|
370
|
-
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
371
|
-
self.droped_features.append(c)
|
372
|
-
x.drop(columns=[c], inplace=True)
|
373
|
-
|
374
|
-
return x, y, groups, {}
|
375
|
-
|
376
341
|
def _prepare_data(
|
377
342
|
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
378
343
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
@@ -403,26 +368,82 @@ class EstimatorWrapper:
|
|
403
368
|
|
404
369
|
return x, y
|
405
370
|
|
406
|
-
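def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]: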
|
407
|
-
x, y, _ = self._prepare_data(x, y)
|
408
|
-
|
409
|
-
if self.droped_features:
|
410
|
-
self.logger.warning(f"Dropped features: {self.droped_features}")
|
411
|
-
x = x.drop(columns=self.droped_features)
|
371
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
372
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
412
373
|
|
374
|
+
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
375
|
+
self.droped_features = []
|
376
|
+
self.converted_to_int = []
|
377
|
+
self.converted_to_str = []
|
378
|
+
self.converted_to_numeric = []
|
413
379
|
for c in x.columns:
|
414
|
-
if c
|
380
|
+
if _get_unique_count(x[c]) < 2:
|
381
|
+
self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
|
382
|
+
if c in self.cat_features:
|
383
|
+
self.cat_features.remove(c)
|
384
|
+
x.drop(columns=[c], inplace=True)
|
385
|
+
self.droped_features.append(c)
|
386
|
+
elif self.text_features is not None and c in self.text_features:
|
387
|
+
x[c] = x[c].astype(str)
|
388
|
+
self.converted_to_str.append(c)
|
389
|
+
elif c in self.cat_features:
|
415
390
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
416
391
|
x[c] = x[c].astype(np.int64)
|
417
|
-
|
392
|
+
self.converted_to_int.append(c)
|
393
|
+
elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
|
394
|
+
self.logger.info(
|
395
|
+
f"Convert categorical feature {c} with integer categories"
|
396
|
+
" to int64 and remove from cat_features"
|
397
|
+
)
|
398
|
+
x[c] = x[c].astype(np.int64)
|
399
|
+
self.converted_to_int.append(c)
|
400
|
+
self.cat_features.remove(c)
|
401
|
+
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
402
|
+
self.logger.info(
|
403
|
+
f"Convert float cat feature {c} to string"
|
404
|
+
)
|
418
405
|
x[c] = x[c].astype(str)
|
419
|
-
|
420
|
-
x[c]
|
406
|
+
self.converted_to_str.append(c)
|
407
|
+
elif x[c].dtype not in ["category", "int64"]:
|
408
|
+
x[c] = x[c].astype(str)
|
409
|
+
self.converted_to_str.append(c)
|
421
410
|
else:
|
422
411
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
412
|
+
self.logger.info(f"Convert bool feature {c} to int64")
|
423
413
|
x[c] = x[c].astype(np.int64)
|
414
|
+
self.converted_to_int.append(c)
|
424
415
|
elif not is_valid_numeric_array_data(x[c]):
|
425
|
-
|
416
|
+
try:
|
417
|
+
x[c] = pd.to_numeric(x[c], errors="raise")
|
418
|
+
self.converted_to_numeric.append(c)
|
419
|
+
except (ValueError, TypeError):
|
420
|
+
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
421
|
+
x.drop(columns=[c], inplace=True)
|
422
|
+
self.droped_features.append(c)
|
423
|
+
|
424
|
+
return x, y, groups, {}
|
425
|
+
|
426
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
427
|
+
x, y, _ = self._prepare_data(x, y)
|
428
|
+
|
429
|
+
if self.droped_features:
|
430
|
+
self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
|
431
|
+
x = x.drop(columns=self.droped_features)
|
432
|
+
|
433
|
+
if self.converted_to_int:
|
434
|
+
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
435
|
+
for c in self.converted_to_int:
|
436
|
+
x[c] = x[c].astype(np.int64)
|
437
|
+
|
438
|
+
if self.converted_to_str:
|
439
|
+
self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
|
440
|
+
for c in self.converted_to_str:
|
441
|
+
x[c] = x[c].astype(str)
|
442
|
+
|
443
|
+
if self.converted_to_numeric:
|
444
|
+
self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
|
445
|
+
for c in self.converted_to_numeric:
|
446
|
+
x[c] = pd.to_numeric(x[c], errors="coerce")
|
426
447
|
|
427
448
|
return x, y, {}
|
428
449
|
|
@@ -443,6 +464,8 @@ class EstimatorWrapper:
|
|
443
464
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
444
465
|
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
445
466
|
metric = roc_auc_score(y, x[baseline_score_column])
|
467
|
+
metric_std = None
|
468
|
+
average_shap_values = None
|
446
469
|
else:
|
447
470
|
self.logger.info(f"Cross validate with estimeator: {self.estimator}")
|
448
471
|
cv_results = cross_validate(
|
upgini/search_task.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.81a3832.dev10
|
3
|
+
Version: 1.2.81a3832.dev12
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,13 +1,13 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=QoAMu0gkmwzsYvsLvBmcg4CfaE-sL6T-rz9s8HCGZY4,34
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=cbQydnSOr7-ioQuEs-X3KYd0ays1BPuwFE_sKmOQc5E,211702
|
7
7
|
upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
10
|
-
upgini/search_task.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=sbxnFyMWCUsVSAy-OwNmDYJxVlGEnTArVUnTOID7miU,43373
|
10
|
+
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
13
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.81a3832.
|
74
|
-
upgini-1.2.81a3832.
|
75
|
-
upgini-1.2.81a3832.
|
76
|
-
upgini-1.2.81a3832.
|
73
|
+
upgini-1.2.81a3832.dev12.dist-info/METADATA,sha256=2cf3_AwHclmjPzAluKb_Y2I_4OecghsB-DqKoJVODls,49173
|
74
|
+
upgini-1.2.81a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.81a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.81a3832.dev12.dist-info/RECORD,,
|
File without changes
|
File without changes
|