upgini 1.2.81a3832.dev10__py3-none-any.whl → 1.2.81a3832.dev11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +1 -1
- upgini/metrics.py +71 -50
- upgini/search_task.py +1 -0
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev11.dist-info}/METADATA +1 -1
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev11.dist-info}/RECORD +8 -8
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev11.dist-info}/WHEEL +0 -0
- {upgini-1.2.81a3832.dev10.dist-info → upgini-1.2.81a3832.dev11.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.81a3832.dev10"
|
1
|
+
__version__ = "1.2.81a3832.dev11"
|
upgini/features_enricher.py
CHANGED
@@ -4245,7 +4245,7 @@ if response.status_code == 200:
|
|
4245
4245
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
4246
4246
|
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
4247
4247
|
if self.fit_columns_renaming:
|
4248
|
-
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
4248
|
+
search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
|
4249
4249
|
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
4250
4250
|
|
4251
4251
|
try:
|
upgini/metrics.py
CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
|
|
15
15
|
from category_encoders.cat_boost import CatBoostEncoder
|
16
16
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
17
17
|
from numpy import log1p
|
18
|
-
from pandas.api.types import is_numeric_dtype
|
18
|
+
from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
|
19
19
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
20
20
|
|
21
21
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
@@ -324,6 +324,9 @@ class EstimatorWrapper:
|
|
324
324
|
self.text_features = text_features
|
325
325
|
self.logger = logger or logging.getLogger()
|
326
326
|
self.droped_features = []
|
327
|
+
self.converted_to_int = []
|
328
|
+
self.converted_to_str = []
|
329
|
+
self.converted_to_numeric = []
|
327
330
|
|
328
331
|
def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
|
329
332
|
x, y, _, fit_params = self._prepare_to_fit(x, y)
|
@@ -335,44 +338,6 @@ class EstimatorWrapper:
|
|
335
338
|
x, _, _ = self._prepare_to_calculate(x, None)
|
336
339
|
return self.estimator.predict(x, **kwargs)
|
337
340
|
|
338
|
-
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
339
|
-
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
340
|
-
|
341
|
-
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
342
|
-
self.droped_features = []
|
343
|
-
for c in x.columns:
|
344
|
-
if _get_unique_count(x[c]) < 2:
|
345
|
-
self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
|
346
|
-
self.droped_features.append(c)
|
347
|
-
if c in self.cat_features:
|
348
|
-
self.cat_features.remove(c)
|
349
|
-
x.drop(columns=[c], inplace=True)
|
350
|
-
elif c in self.cat_features:
|
351
|
-
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
352
|
-
x[c] = x[c].astype(np.int64)
|
353
|
-
elif is_numeric_object(x[c]):
|
354
|
-
self.logger.warning(
|
355
|
-
f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
|
356
|
-
)
|
357
|
-
x[c] = pd.to_numeric(x[c], errors="coerce")
|
358
|
-
self.cat_features.remove(c)
|
359
|
-
elif x[c].dtype != "category":
|
360
|
-
x[c] = x[c].astype(str)
|
361
|
-
elif self.text_features is not None and c in self.text_features:
|
362
|
-
x[c] = x[c].astype(str)
|
363
|
-
else:
|
364
|
-
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
365
|
-
x[c] = x[c].astype(np.int64)
|
366
|
-
elif not is_valid_numeric_array_data(x[c]):
|
367
|
-
try:
|
368
|
-
x[c] = pd.to_numeric(x[c], errors="raise")
|
369
|
-
except (ValueError, TypeError):
|
370
|
-
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
371
|
-
self.droped_features.append(c)
|
372
|
-
x.drop(columns=[c], inplace=True)
|
373
|
-
|
374
|
-
return x, y, groups, {}
|
375
|
-
|
376
341
|
def _prepare_data(
|
377
342
|
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
378
343
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
@@ -403,26 +368,82 @@ class EstimatorWrapper:
|
|
403
368
|
|
404
369
|
return x, y
|
405
370
|
|
406
|
-
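def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]: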
|
407
|
-
x, y, _ = self._prepare_data(x, y)
|
408
|
-
|
409
|
-
if self.droped_features:
|
410
|
-
self.logger.warning(f"Dropped features: {self.droped_features}")
|
411
|
-
x = x.drop(columns=self.droped_features)
|
371
|
+
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
372
|
+
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
412
373
|
|
374
|
+
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
375
|
+
self.droped_features = []
|
376
|
+
self.converted_to_int = []
|
377
|
+
self.converted_to_str = []
|
378
|
+
self.converted_to_numeric = []
|
413
379
|
for c in x.columns:
|
414
|
-
if c
|
380
|
+
if _get_unique_count(x[c]) < 2:
|
381
|
+
self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
|
382
|
+
if c in self.cat_features:
|
383
|
+
self.cat_features.remove(c)
|
384
|
+
x.drop(columns=[c], inplace=True)
|
385
|
+
self.droped_features.append(c)
|
386
|
+
elif self.text_features is not None and c in self.text_features:
|
387
|
+
x[c] = x[c].astype(str)
|
388
|
+
self.converted_to_str.append(c)
|
389
|
+
elif c in self.cat_features:
|
415
390
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
416
391
|
x[c] = x[c].astype(np.int64)
|
417
|
-
|
392
|
+
self.converted_to_int.append(c)
|
393
|
+
elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
|
394
|
+
self.logger.info(
|
395
|
+
f"Convert categorical feature {c} with integer categories"
|
396
|
+
" to int64 and remove from cat_features"
|
397
|
+
)
|
398
|
+
x[c] = x[c].astype(np.int64)
|
399
|
+
self.converted_to_int.append(c)
|
400
|
+
self.cat_features.remove(c)
|
401
|
+
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
402
|
+
self.logger.info(
|
403
|
+
f"Convert float cat feature {c} to string"
|
404
|
+
)
|
418
405
|
x[c] = x[c].astype(str)
|
419
|
-
|
420
|
-
x[c]
|
406
|
+
self.converted_to_str.append(c)
|
407
|
+
elif x[c].dtype not in ["category", "int64"]:
|
408
|
+
x[c] = x[c].astype(str)
|
409
|
+
self.converted_to_str.append(c)
|
421
410
|
else:
|
422
411
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
412
|
+
self.logger.info(f"Convert bool feature {c} to int64")
|
423
413
|
x[c] = x[c].astype(np.int64)
|
414
|
+
self.converted_to_int.append(c)
|
424
415
|
elif not is_valid_numeric_array_data(x[c]):
|
425
|
-
|
416
|
+
try:
|
417
|
+
x[c] = pd.to_numeric(x[c], errors="raise")
|
418
|
+
self.converted_to_numeric.append(c)
|
419
|
+
except (ValueError, TypeError):
|
420
|
+
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
421
|
+
x.drop(columns=[c], inplace=True)
|
422
|
+
self.droped_features.append(c)
|
423
|
+
|
424
|
+
return x, y, groups, {}
|
425
|
+
|
426
|
+
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
427
|
+
x, y, _ = self._prepare_data(x, y)
|
428
|
+
|
429
|
+
if self.droped_features:
|
430
|
+
self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
|
431
|
+
x = x.drop(columns=self.droped_features)
|
432
|
+
|
433
|
+
if self.converted_to_int:
|
434
|
+
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
435
|
+
for c in self.converted_to_int:
|
436
|
+
x[c] = x[c].astype(np.int64)
|
437
|
+
|
438
|
+
if self.converted_to_str:
|
439
|
+
self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
|
440
|
+
for c in self.converted_to_str:
|
441
|
+
x[c] = x[c].astype(str)
|
442
|
+
|
443
|
+
if self.converted_to_numeric:
|
444
|
+
self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
|
445
|
+
for c in self.converted_to_numeric:
|
446
|
+
x[c] = pd.to_numeric(x[c], errors="coerce")
|
426
447
|
|
427
448
|
return x, y, {}
|
428
449
|
|
upgini/search_task.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.81a3832.dev10
|
3
|
+
Version: 1.2.81a3832.dev11
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,13 +1,13 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=aMsoGp7JafbllKBjbZ_9sxh2xfd5oZMdcOt6Id_WaBU,34
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=aIG16mpdUZqV0GLMoDA4LiXMPbu3a-m72mhVqGnIww4,210860
|
7
7
|
upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
10
|
-
upgini/search_task.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=Zb-AwpstEKWaIuLqfIWLF--UGQwIoLbGYnHRlyPQ_cY,43304
|
10
|
+
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
13
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.81a3832.
|
74
|
-
upgini-1.2.81a3832.
|
75
|
-
upgini-1.2.81a3832.
|
76
|
-
upgini-1.2.81a3832.
|
73
|
+
upgini-1.2.81a3832.dev11.dist-info/METADATA,sha256=h9Tlze7oWU3tEfYMuF9BYZTD7hlFeZM-zjrkIzMml4k,49173
|
74
|
+
upgini-1.2.81a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.81a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.81a3832.dev11.dist-info/RECORD,,
|
File without changes
|
File without changes
|