upgini 1.2.117a1__tar.gz → 1.2.118__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.117a1 → upgini-1.2.118}/.gitignore +1 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/PKG-INFO +1 -1
- upgini-1.2.118/src/upgini/__about__.py +1 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/features_enricher.py +39 -5
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/metrics.py +3 -2
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/psi.py +0 -3
- upgini-1.2.117a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.117a1 → upgini-1.2.118}/LICENSE +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/README.md +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/pyproject.toml +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/ads.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/dataset.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/errors.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/http.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/metadata.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/search_task.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/spinner.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/config.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/hash_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.117a1 → upgini-1.2.118}/src/upgini/version_validator.py +0 -0
@@ -0,0 +1 @@
+__version__ = "1.2.118"
@@ -1423,8 +1423,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
1423
1423
|
# Find latest eval set or earliest if all eval sets are before train set
|
1424
1424
|
date_column = self._get_date_column(search_keys)
|
1425
1425
|
|
1426
|
-
|
1427
|
-
|
1426
|
+
x_date = X[date_column].dropna()
|
1427
|
+
if not is_numeric_dtype(x_date):
|
1428
|
+
x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
|
1429
|
+
main_min_date = x_date.min()
|
1430
|
+
|
1431
|
+
for eval_x, _ in eval_set:
|
1432
|
+
eval_x_date = eval_x[date_column].dropna()
|
1433
|
+
if not is_numeric_dtype(eval_x_date):
|
1434
|
+
eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
|
1428
1435
|
|
1429
1436
|
# Find minimum date for each eval_set and compare with main dataset
|
1430
1437
|
eval_dates = []
|
@@ -1433,8 +1440,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
1433
1440
|
if len(eval_x) < 1000:
|
1434
1441
|
self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
|
1435
1442
|
continue
|
1436
|
-
|
1437
|
-
|
1443
|
+
eval_x_date = eval_x[date_column].dropna()
|
1444
|
+
if not is_numeric_dtype(eval_x_date):
|
1445
|
+
eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
|
1446
|
+
eval_min_date = eval_x_date.min()
|
1447
|
+
eval_max_date = eval_x_date.max()
|
1438
1448
|
eval_dates.append((i, eval_min_date, eval_max_date))
|
1439
1449
|
|
1440
1450
|
if not eval_dates:
|
@@ -1460,6 +1470,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1460
1470
|
checking_eval_set_df = checking_eval_set_df.copy()
|
1461
1471
|
|
1462
1472
|
checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
|
1473
|
+
if not is_numeric_dtype(checking_eval_set_df[date_column]):
|
1474
|
+
checking_eval_set_df[date_column] = (
|
1475
|
+
pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
|
1476
|
+
)
|
1463
1477
|
|
1464
1478
|
psi_values_sparse = calculate_sparsity_psi(
|
1465
1479
|
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
|
@@ -3708,6 +3722,25 @@ if response.status_code == 200:
|
|
3708
3722
|
else:
|
3709
3723
|
raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
|
3710
3724
|
|
3725
|
+
if any(validated_eval_X.dtypes != X.dtypes):
|
3726
|
+
x_types = X.dtypes
|
3727
|
+
eval_types = validated_eval_X.dtypes
|
3728
|
+
# Find columns with different types
|
3729
|
+
diff_cols = [
|
3730
|
+
(col, x_types[col], eval_types[col])
|
3731
|
+
for col in x_types.index
|
3732
|
+
if x_types[col] != eval_types[col]
|
3733
|
+
]
|
3734
|
+
diff_col_names = [col for col, _, _ in diff_cols]
|
3735
|
+
# print columns with different types
|
3736
|
+
print("Columns with different types:")
|
3737
|
+
for col, x_type, eval_type in diff_cols:
|
3738
|
+
print("-" * 50)
|
3739
|
+
print(f"Column: {col}")
|
3740
|
+
print(f"X type: {x_type}")
|
3741
|
+
print(f"Eval_set type: {eval_type}")
|
3742
|
+
raise ValidationError(self.bundle.get("eval_x_and_x_diff_dtypes").format(diff_col_names))
|
3743
|
+
|
3711
3744
|
if _num_samples(validated_eval_X) != _num_samples(eval_y):
|
3712
3745
|
raise ValidationError(
|
3713
3746
|
self.bundle.get("x_and_y_diff_size_eval_set").format(
|
@@ -4420,7 +4453,8 @@ if response.status_code == 200:
|
|
4420
4453
|
|
4421
4454
|
if len(features_info) > 0:
|
4422
4455
|
self.features_info = pd.DataFrame(features_info)
|
4423
|
-
|
4456
|
+
# If all psi values are 0 or null, drop psi column
|
4457
|
+
if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
|
4424
4458
|
self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
|
4425
4459
|
self._features_info_without_links = pd.DataFrame(features_info_without_links)
|
4426
4460
|
self._internal_features_info = pd.DataFrame(internal_features_info)
|
@@ -847,7 +847,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
847
847
|
|
848
848
|
feature_importance = {}
|
849
849
|
for i, col in enumerate(x.columns):
|
850
|
-
feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
850
|
+
feature_importance[col] = float(np.mean(np.abs(shap_values[:, i])))
|
851
851
|
|
852
852
|
return feature_importance
|
853
853
|
|
@@ -922,6 +922,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
922
922
|
encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
|
923
923
|
else:
|
924
924
|
encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
|
925
|
+
x_copy = x_copy.drop(columns=self.cat_features, errors="ignore")
|
925
926
|
x_copy[self.cat_features] = encoded
|
926
927
|
|
927
928
|
shap_matrix = estimator.predict(
|
@@ -943,7 +944,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
943
944
|
|
944
945
|
feature_importance = {}
|
945
946
|
for i, col in enumerate(x.columns):
|
946
|
-
feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
|
947
|
+
feature_importance[col] = float(np.mean(np.abs(shap_matrix[:, i])))
|
947
948
|
|
948
949
|
return feature_importance
|
949
950
|
|
@@ -123,6 +123,7 @@ unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of
|
|
123
123
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
|
124
124
|
unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
|
125
125
|
eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
|
126
|
+
eval_x_and_x_diff_dtypes=The column types in eval_set are different from the column types in X: {}
|
126
127
|
unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
|
127
128
|
y_is_constant_eval_set=y in eval_set is a constant. Relevant feature search requires a non-constant y
|
128
129
|
x_and_y_diff_size_eval_set=X and y in eval_set contain different number of rows: {}, {}
|
@@ -82,9 +82,6 @@ def calculate_features_psi(
|
|
82
82
|
) -> dict[str, float]:
|
83
83
|
empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
|
84
84
|
|
85
|
-
if not is_numeric_dtype(df[date_column]):
|
86
|
-
df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
|
87
|
-
|
88
85
|
# Filter out rows with missing dates
|
89
86
|
df = df[df[date_column].notna()].copy()
|
90
87
|
|
@@ -1 +0,0 @@
-__version__ = "1.2.117a1"
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|