upgini 1.2.87.dev4__py3-none-any.whl → 1.2.87.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +37 -38
- upgini/metrics.py +2 -0
- upgini/resource_bundle/strings.properties +2 -0
- {upgini-1.2.87.dev4.dist-info → upgini-1.2.87.dev5.dist-info}/METADATA +1 -1
- {upgini-1.2.87.dev4.dist-info → upgini-1.2.87.dev5.dist-info}/RECORD +8 -8
- {upgini-1.2.87.dev4.dist-info → upgini-1.2.87.dev5.dist-info}/WHEEL +0 -0
- {upgini-1.2.87.dev4.dist-info → upgini-1.2.87.dev5.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.87.dev4"
|
1
|
+
__version__ = "1.2.87.dev5"
|
upgini/features_enricher.py
CHANGED
@@ -300,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
300
300
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
301
301
|
self.metrics: Optional[pd.DataFrame] = None
|
302
302
|
self.feature_names_ = []
|
303
|
-
self.dropped_client_feature_names_ = []
|
303
|
+
self.zero_shap_client_features = []
|
304
304
|
self.feature_importances_ = []
|
305
305
|
self.search_id = search_id
|
306
306
|
self.disable_force_downsampling = disable_force_downsampling
|
@@ -315,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
315
315
|
self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
|
316
316
|
self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
|
317
317
|
file_metadata = self._search_task.get_file_metadata(trace_id)
|
318
|
-
x_columns = [c.
|
318
|
+
x_columns = [c.name for c in file_metadata.columns]
|
319
319
|
self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
|
320
320
|
df = pd.DataFrame(columns=x_columns)
|
321
321
|
self.__prepare_feature_importances(trace_id, df, silent=True)
|
@@ -2347,9 +2347,7 @@ if response.status_code == 200:
|
|
2347
2347
|
|
2348
2348
|
is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
|
2349
2349
|
|
2350
|
-
columns_to_drop = [
|
2351
|
-
c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
2352
|
-
]
|
2350
|
+
columns_to_drop = [c for c in df.columns if c in self.feature_names_]
|
2353
2351
|
if len(columns_to_drop) > 0:
|
2354
2352
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
2355
2353
|
self.logger.warning(msg)
|
@@ -2405,6 +2403,17 @@ if response.status_code == 200:
|
|
2405
2403
|
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
2406
2404
|
columns_renaming = normalizer.columns_renaming
|
2407
2405
|
|
2406
|
+
# If there are no external features, we don't call backend on transform
|
2407
|
+
external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
2408
|
+
if not external_features:
|
2409
|
+
self.logger.warning(
|
2410
|
+
"No external features found, returning original dataframe"
|
2411
|
+
f" with generated important features: {filtered_columns}"
|
2412
|
+
)
|
2413
|
+
filtered_columns = [c for c in filtered_columns if c in df.columns]
|
2414
|
+
self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
|
2415
|
+
return df[filtered_columns], columns_renaming, generated_features, search_keys
|
2416
|
+
|
2408
2417
|
# Don't pass all features in backend on transform
|
2409
2418
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
2410
2419
|
features_for_transform = self._search_task.get_features_for_transform() or []
|
@@ -2491,26 +2500,6 @@ if response.status_code == 200:
|
|
2491
2500
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
2492
2501
|
df = converter.convert(df)
|
2493
2502
|
|
2494
|
-
# TODO return X + generated features
|
2495
|
-
# external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
2496
|
-
# if not external_features:
|
2497
|
-
# # Unexplode dataframe back to original shape
|
2498
|
-
# if len(unnest_search_keys) > 0:
|
2499
|
-
# df = df.groupby(ENTITY_SYSTEM_RECORD_ID).first().reset_index()
|
2500
|
-
|
2501
|
-
# # Get important features from etalon source
|
2502
|
-
# etalon_features = [fm.name for fm in features_meta if fm.shap_value > 0 and fm.source == "etalon"]
|
2503
|
-
|
2504
|
-
# # Select only etalon features that exist in dataframe
|
2505
|
-
# available_etalon_features = [f for f in etalon_features if f in df.columns]
|
2506
|
-
|
2507
|
-
# # Return original dataframe with only important etalon features
|
2508
|
-
# result = df[available_etalon_features].copy()
|
2509
|
-
# result.index = validated_Xy.index
|
2510
|
-
|
2511
|
-
# return result, columns_renaming, generated_features, search_keys
|
2512
|
-
# ...
|
2513
|
-
|
2514
2503
|
meaning_types = {}
|
2515
2504
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2516
2505
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
@@ -2659,14 +2648,15 @@ if response.status_code == 200:
|
|
2659
2648
|
how="left",
|
2660
2649
|
)
|
2661
2650
|
|
2651
|
+
selected_generated_features = [
|
2652
|
+
c for c in generated_features if not self.fit_select_features or c in filtered_columns
|
2653
|
+
]
|
2662
2654
|
selecting_columns = [
|
2663
2655
|
c
|
2664
|
-
for c in itertools.chain(validated_Xy.columns.tolist(),
|
2665
|
-
if c not in self.
|
2656
|
+
for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
|
2657
|
+
if c not in self.zero_shap_client_features
|
2666
2658
|
]
|
2667
|
-
selecting_columns.extend(
|
2668
|
-
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
2669
|
-
)
|
2659
|
+
selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
|
2670
2660
|
if add_fit_system_record_id:
|
2671
2661
|
selecting_columns.append(SORT_ID)
|
2672
2662
|
|
@@ -3372,9 +3362,13 @@ if response.status_code == 200:
|
|
3372
3362
|
Xy[TARGET] = y
|
3373
3363
|
validated_y = Xy[TARGET].copy()
|
3374
3364
|
|
3375
|
-
|
3365
|
+
y_nunique = validated_y.nunique()
|
3366
|
+
if y_nunique < 2:
|
3376
3367
|
raise ValidationError(self.bundle.get("y_is_constant"))
|
3377
3368
|
|
3369
|
+
if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
|
3370
|
+
raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
|
3371
|
+
|
3378
3372
|
return validated_y
|
3379
3373
|
|
3380
3374
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
@@ -3449,9 +3443,13 @@ if response.status_code == 200:
|
|
3449
3443
|
else:
|
3450
3444
|
raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
|
3451
3445
|
|
3452
|
-
|
3446
|
+
eval_y_nunique = validated_eval_y.nunique()
|
3447
|
+
if eval_y_nunique < 2:
|
3453
3448
|
raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
|
3454
3449
|
|
3450
|
+
if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
|
3451
|
+
raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
|
3452
|
+
|
3455
3453
|
return validated_eval_X, validated_eval_y
|
3456
3454
|
|
3457
3455
|
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
|
@@ -3993,10 +3991,11 @@ if response.status_code == 200:
|
|
3993
3991
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
3994
3992
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
3995
3993
|
|
3994
|
+
# To be sure that names with hash suffixes
|
3996
3995
|
df = df.rename(columns=original_names_dict)
|
3997
3996
|
|
3998
3997
|
self.feature_names_ = []
|
3999
|
-
self.dropped_client_feature_names_ = []
|
3998
|
+
self.zero_shap_client_features = []
|
4000
3999
|
self.feature_importances_ = []
|
4001
4000
|
features_info = []
|
4002
4001
|
features_info_without_links = []
|
@@ -4008,7 +4007,7 @@ if response.status_code == 200:
|
|
4008
4007
|
if feature_meta.name in original_names_dict.keys():
|
4009
4008
|
feature_meta.name = original_names_dict[feature_meta.name]
|
4010
4009
|
|
4011
|
-
is_client_feature = feature_meta.name in df.columns
|
4010
|
+
is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
|
4012
4011
|
|
4013
4012
|
# Show and update shap values for client features only if select_features is True
|
4014
4013
|
if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
|
@@ -4024,13 +4023,13 @@ if response.status_code == 200:
|
|
4024
4023
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
4025
4024
|
|
4026
4025
|
for feature_meta in features_meta:
|
4027
|
-
|
4028
|
-
is_client_feature =
|
4026
|
+
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4027
|
+
is_client_feature = original_name in df.columns
|
4029
4028
|
|
4030
4029
|
# TODO make a decision about selected features based on special flag from mlb
|
4031
4030
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4032
|
-
if self.fit_select_features:
|
4033
|
-
self.
|
4031
|
+
if is_client_feature and self.fit_select_features:
|
4032
|
+
self.zero_shap_client_features.append(original_name)
|
4034
4033
|
continue
|
4035
4034
|
|
4036
4035
|
# Use only important features
|
upgini/metrics.py
CHANGED
@@ -815,6 +815,8 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
815
815
|
encoded = cat_encoder.transform(x[self.cat_features])
|
816
816
|
cat_features = encoded.columns.to_list()
|
817
817
|
x[self.cat_features] = encoded
|
818
|
+
else:
|
819
|
+
cat_features = self.cat_features
|
818
820
|
|
819
821
|
# Create Pool for fold data, if need (for example, when categorical features are present)
|
820
822
|
fold_pool = Pool(
|
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
|
|
68
68
|
invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
|
69
69
|
no_important_features_for_transform=There are no important features for transform. Return input as transformed
|
70
70
|
search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
|
71
|
+
binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
|
72
|
+
binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set
|
71
73
|
|
72
74
|
# Validation errors
|
73
75
|
# params validation
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=wcphyJpGJs2mZPWvsK3omRtXm2Q4NsYXyO0X5zcwLMw,28
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
|
7
7
|
upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.87.
|
74
|
-
upgini-1.2.87.
|
75
|
-
upgini-1.2.87.
|
76
|
-
upgini-1.2.87.
|
73
|
+
upgini-1.2.87.dev5.dist-info/METADATA,sha256=Jdb6gn8ijXK4ccs5hC9yEPA6dQBzc5FtelPXOJgBfJA,49167
|
74
|
+
upgini-1.2.87.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.87.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.87.dev5.dist-info/RECORD,,
|
File without changes
|
File without changes
|