upgini 1.2.87.dev5__py3-none-any.whl → 1.2.88a3884.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +37 -1
- upgini/metrics.py +11 -10
- {upgini-1.2.87.dev5.dist-info → upgini-1.2.88a3884.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.87.dev5.dist-info → upgini-1.2.88a3884.dev1.dist-info}/RECORD +7 -7
- {upgini-1.2.87.dev5.dist-info → upgini-1.2.88a3884.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.87.dev5.dist-info → upgini-1.2.88a3884.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.87.dev5"
|
1
|
+
__version__ = "1.2.88a3884.dev1"
|
upgini/features_enricher.py
CHANGED
@@ -1671,6 +1671,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1671
1671
|
enriched_eval_y_sorted,
|
1672
1672
|
)
|
1673
1673
|
|
1674
|
+
fitting_X, fitting_enriched_X, fitting_eval_set_dict = self._convert_id_columns_to_int(
|
1675
|
+
fitting_X, fitting_enriched_X, fitting_eval_set_dict, columns_renaming
|
1676
|
+
)
|
1677
|
+
|
1674
1678
|
return (
|
1675
1679
|
validated_X,
|
1676
1680
|
fitting_X,
|
@@ -1684,6 +1688,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
1684
1688
|
columns_renaming,
|
1685
1689
|
)
|
1686
1690
|
|
1691
|
+
def _convert_id_columns_to_int(
    self,
    fitting_X: pd.DataFrame,
    fitting_enriched_X: pd.DataFrame,
    fitting_eval_set_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]],
    columns_renaming: Optional[Dict[str, str]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, Tuple[pd.DataFrame, pd.Series]]]:
    """Replace every id column with integer category codes, in place.

    For each name in ``self.id_columns`` the column values of ``fitting_X``,
    ``fitting_enriched_X`` and every eval-set frame are concatenated and
    encoded together, so equal values receive the same code in all frames.
    Codes come from ``Series.cat.codes``.

    Args:
        fitting_X: Frame whose id columns are overwritten.
        fitting_enriched_X: Frame whose id columns are overwritten.
        fitting_eval_set_dict: Mapping to ``(X, y)`` pairs; only the ``X``
            frame (element 0) of each pair is modified.
        columns_renaming: Mapping whose *values* are matched against
            ``self.id_columns``; the corresponding *key* is the column name
            looked up in the frames. Names without a match are used as-is.
            ``None`` (the default) means no renaming.

    Returns:
        The same three objects that were passed in (frames are mutated, not
        copied): ``(fitting_X, fitting_enriched_X, fitting_eval_set_dict)``.

    Raises:
        KeyError: If a resolved id column is absent from any of the frames.
    """
    if not self.id_columns:
        return fitting_X, fitting_enriched_X, fitting_eval_set_dict

    inverse_columns_renaming = {v: k for k, v in (columns_renaming or {}).items()}

    # Fixed frame order: the offsets below rely on it matching the concat order.
    frames = [fitting_X, fitting_enriched_X]
    frames.extend(eval_set_pair[0] for eval_set_pair in fitting_eval_set_dict.values())

    self.logger.info(f"Convert id columns to int: {self.id_columns}")
    for id_column in self.id_columns:
        col = inverse_columns_renaming.get(id_column, id_column)

        # Encode all frames at once so the codes are consistent across them.
        codes = pd.concat([frame[col] for frame in frames]).astype("category").cat.codes

        offset = 0
        for frame in frames:
            end = offset + len(frame)
            # Positional write-back: each frame takes exactly its own slice of the codes.
            frame[col] = codes.iloc[offset:end].to_numpy()
            offset = end

    return fitting_X, fitting_enriched_X, fitting_eval_set_dict
|
1722
|
+
|
1687
1723
|
@dataclass
|
1688
1724
|
class _SampledDataForMetrics:
|
1689
1725
|
X_sampled: pd.DataFrame
|
@@ -3976,7 +4012,7 @@ if response.status_code == 200:
|
|
3976
4012
|
if features_meta is None:
|
3977
4013
|
raise Exception(self.bundle.get("missing_features_meta"))
|
3978
4014
|
|
3979
|
-
return [f.name for f in features_meta if f.type == "categorical"]
|
4015
|
+
return [f.name for f in features_meta if f.type == "categorical" and f.name not in self.id_columns]
|
3980
4016
|
|
3981
4017
|
def __prepare_feature_importances(
|
3982
4018
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
upgini/metrics.py
CHANGED
@@ -332,7 +332,7 @@ class EstimatorWrapper:
|
|
332
332
|
self.groups = groups
|
333
333
|
self.text_features = text_features
|
334
334
|
self.logger = logger or logging.getLogger()
|
335
|
-
self.
|
335
|
+
self.dropped_features = []
|
336
336
|
self.converted_to_int = []
|
337
337
|
self.converted_to_str = []
|
338
338
|
self.converted_to_numeric = []
|
@@ -381,10 +381,11 @@ class EstimatorWrapper:
|
|
381
381
|
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
382
382
|
|
383
383
|
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
384
|
-
self.
|
384
|
+
self.dropped_features = []
|
385
385
|
self.converted_to_int = []
|
386
386
|
self.converted_to_str = []
|
387
387
|
self.converted_to_numeric = []
|
388
|
+
|
388
389
|
for c in x.columns:
|
389
390
|
|
390
391
|
if _get_unique_count(x[c]) < 2:
|
@@ -392,7 +393,7 @@ class EstimatorWrapper:
|
|
392
393
|
if c in self.cat_features:
|
393
394
|
self.cat_features.remove(c)
|
394
395
|
x.drop(columns=[c], inplace=True)
|
395
|
-
self.
|
396
|
+
self.dropped_features.append(c)
|
396
397
|
elif self.text_features is not None and c in self.text_features:
|
397
398
|
x[c] = x[c].astype(str)
|
398
399
|
self.converted_to_str.append(c)
|
@@ -427,16 +428,16 @@ class EstimatorWrapper:
|
|
427
428
|
except (ValueError, TypeError):
|
428
429
|
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
429
430
|
x.drop(columns=[c], inplace=True)
|
430
|
-
self.
|
431
|
+
self.dropped_features.append(c)
|
431
432
|
|
432
433
|
return x, y, groups, {}
|
433
434
|
|
434
435
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
435
436
|
x, y, _ = self._prepare_data(x, y)
|
436
437
|
|
437
|
-
if self.
|
438
|
-
self.logger.info(f"Drop features on calculate metrics: {self.
|
439
|
-
x = x.drop(columns=self.
|
438
|
+
if self.dropped_features:
|
439
|
+
self.logger.info(f"Drop features on calculate metrics: {self.dropped_features}")
|
440
|
+
x = x.drop(columns=self.dropped_features)
|
440
441
|
|
441
442
|
if self.converted_to_int:
|
442
443
|
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
@@ -797,7 +798,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
797
798
|
)
|
798
799
|
for f in high_cardinality_features:
|
799
800
|
self.text_features.remove(f)
|
800
|
-
self.
|
801
|
+
self.dropped_features.append(f)
|
801
802
|
x = x.drop(columns=f, errors="ignore")
|
802
803
|
return super().cross_val_predict(x, y, baseline_score_column)
|
803
804
|
else:
|
@@ -897,7 +898,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
897
898
|
for c in x.columns:
|
898
899
|
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
899
900
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
900
|
-
self.
|
901
|
+
self.dropped_features.append(c)
|
901
902
|
x = x.drop(columns=c, errors="ignore")
|
902
903
|
return x, y_numpy, groups, params
|
903
904
|
|
@@ -988,7 +989,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
988
989
|
for c in x.columns:
|
989
990
|
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
990
991
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
991
|
-
self.
|
992
|
+
self.dropped_features.append(c)
|
992
993
|
x = x.drop(columns=c, errors="ignore")
|
993
994
|
return x, y_numpy, groups, params
|
994
995
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.87.dev5
|
3
|
+
Version: 1.2.88a3884.dev1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=RCAVI4TwhC_It_MBONjiSYbrXtFotET-nMOyORfyw40,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=kkNePcLwHKNOLuZLDD8XcIHKVoo_VPUbUM4TSwey60I,218038
|
7
7
|
upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=ju7JPwLUe8vtFUGbBV6w6ecySd952XucrqToc1edVBs,45306
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.88a3884.dev1.dist-info/METADATA,sha256=KPOdFTBugj7fEYybkMyXP9uABuM75J-eKJmF7V-mEMs,49172
|
74
|
+
upgini-1.2.88a3884.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
75
|
+
upgini-1.2.88a3884.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.88a3884.dev1.dist-info/RECORD,,
|
File without changes
|