upgini 1.2.87.dev3__py3-none-any.whl → 1.2.87.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +52 -26
- upgini/metrics.py +3 -1
- upgini/resource_bundle/strings.properties +3 -1
- upgini/utils/deduplicate_utils.py +1 -1
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/METADATA +1 -1
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/RECORD +9 -9
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/WHEEL +0 -0
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.87.dev3"
+__version__ = "1.2.87.dev5"
upgini/features_enricher.py
CHANGED
@@ -300,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -315,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
         self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
         file_metadata = self._search_task.get_file_metadata(trace_id)
-        x_columns = [c.
+        x_columns = [c.name for c in file_metadata.columns]
         self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
         df = pd.DataFrame(columns=x_columns)
         self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -2299,11 +2299,16 @@ if response.status_code == 200:

         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)

-        self.__validate_search_keys(self.search_keys, self.search_id)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
+        # If there are no important features, return original dataframe
+        if not filtered_columns:
+            msg = self.bundle.get("no_important_features_for_transform")
+            self.__log_warning(msg, show_support_link=True)
+            return X, {c: c for c in X.columns}, [], dict()

-
-        self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-        return X, {c: c for c in X.columns}, [], {}
+        self.__validate_search_keys(self.search_keys, self.search_id)

         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
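With this change the selected feature set is computed at the very top of the transform path, so the no-op case exits before `__validate_search_keys` or any backend call. A minimal sketch of that guard-clause shape (function name and signature are illustrative, not upgini's internal API):

from typing import Dict, List, Tuple

import pandas as pd


def transform_or_noop(
    X: pd.DataFrame, filtered_columns: List[str]
) -> Tuple[pd.DataFrame, Dict[str, str], List[str], dict]:
    # Nothing important survived selection: return the input unchanged, an
    # identity column mapping, no generated features, and empty search keys.
    if not filtered_columns:
        return X, {c: c for c in X.columns}, [], dict()
    # ... otherwise validate search keys and call the enrichment backend ...
    raise NotImplementedError


X = pd.DataFrame({"a": [1, 2]})
out, renaming, generated, keys = transform_or_noop(X, [])
assert out is X  # input is passed through untouched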
@@ -2342,9 +2347,7 @@ if response.status_code == 200:

         is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

-        columns_to_drop = [
-            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
-        ]
+        columns_to_drop = [c for c in df.columns if c in self.feature_names_]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
@@ -2400,6 +2403,17 @@ if response.status_code == 200:
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming

+        # If there are no external features, we don't call backend on transform
+        external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
+        if not external_features:
+            self.logger.warning(
+                "No external features found, returning original dataframe"
+                f" with generated important features: {filtered_columns}"
+            )
+            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
+            return df[filtered_columns], columns_renaming, generated_features, search_keys
+
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
         features_for_transform = self._search_task.get_features_for_transform() or []
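This early return means transform skips the backend round-trip entirely when every important feature is generated on the client side. A hedged sketch of the decision, with a stand-in dataclass in place of upgini's feature metadata:

from dataclasses import dataclass
from typing import List


@dataclass
class FeatureMeta:  # stand-in for the real metadata class
    name: str
    shap_value: float
    source: str


def needs_backend_call(features_meta: List[FeatureMeta]) -> bool:
    # External features are those with positive importance that did not come
    # from the client's own dataset (source "etalon").
    external = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
    return bool(external)


meta = [FeatureMeta("f1", 0.3, "etalon"), FeatureMeta("f2", 0.0, "ads")]
print(needs_backend_call(meta))  # False: no important external features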
@@ -2444,6 +2458,8 @@ if response.status_code == 200:
         # Explode multiple search keys
         df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)

+        # Convert search keys and generate features on them
+
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         if email_column:
@@ -2632,17 +2648,15 @@ if response.status_code == 200:
             how="left",
         )

+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in filtered_columns
+        ]
         selecting_columns = [
             c
-            for c in itertools.chain(validated_Xy.columns.tolist(),
-            if c not in self.dropped_client_feature_names_
+            for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
+            if c not in self.zero_shap_client_features
         ]
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
-        selecting_columns.extend(
-            c for c in filtered_columns if c in result.columns and c not in validated_X.columns
-        )
+        selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)

@@ -2942,7 +2956,10 @@ if response.status_code == 200:
             self.__log_warning(fintech_warning)
         df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
         if full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+            if len(df) == 0:
+                raise ValidationError(full_duplicates_warning)
+            else:
+                self.__log_warning(full_duplicates_warning)

         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
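An empty dataframe after duplicate removal previously produced only a warning and failed later in a less obvious place; it is now a hard error. A sketch of the warn-vs-raise split, with a local stand-in for upgini's ValidationError:

import logging

import pandas as pd


class ValidationError(Exception):  # stand-in for upgini's own exception type
    pass


def report_duplicates(df: pd.DataFrame, warning: str, logger: logging.Logger) -> None:
    if warning:
        if len(df) == 0:
            # Every row was dropped as a duplicate: nothing left to fit on.
            raise ValidationError(warning)
        else:
            logger.warning(warning)  # rows remain: keep the old soft behavior


log = logging.getLogger("demo")
report_duplicates(pd.DataFrame({"a": [1]}), "1% of rows were duplicated", log)  # warns, no raise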
@@ -3345,9 +3362,13 @@ if response.status_code == 200:
             Xy[TARGET] = y
             validated_y = Xy[TARGET].copy()

-        if validated_y.nunique() < 2:
+        y_nunique = validated_y.nunique()
+        if y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant"))

+        if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
+
         return validated_y

     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
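The added check makes an explicitly binary task fail fast when the target has more than two classes; a constant target was already rejected. The next hunk applies the same check to each eval_set pair. A self-contained version of the rule:

import pandas as pd


def validate_binary_target(y: pd.Series) -> None:
    y_nunique = y.nunique()
    if y_nunique < 2:
        raise ValueError("y is constant")
    if y_nunique != 2:
        # Mirrors the new bundle message binary_target_unique_count_not_2
        raise ValueError(f"Binary target should contain only 2 unique values, but {y_nunique} found")


validate_binary_target(pd.Series([0, 1, 0, 1]))  # passes
# validate_binary_target(pd.Series([0, 1, 2]))   # raises: 3 unique values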
@@ -3422,9 +3443,13 @@ if response.status_code == 200:
         else:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

-        if validated_eval_y.nunique() < 2:
+        eval_y_nunique = validated_eval_y.nunique()
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
+
         return validated_eval_X, validated_eval_y

     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
@@ -3966,10 +3991,11 @@ if response.status_code == 200:
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

+        # To be sure that names with hash suffixes
         df = df.rename(columns=original_names_dict)

         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -3981,7 +4007,7 @@ if response.status_code == 200:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns

             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -3997,13 +4023,13 @@ if response.status_code == 200:
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))

         for feature_meta in features_meta:
-
-            is_client_feature = feature_meta.name in df.columns
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            is_client_feature = original_name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if self.fit_select_features:
-                    self.dropped_client_feature_names_.append(feature_meta.name)
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue

             # Use only important features
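Taken together with the earlier hunks, the renamed attribute records client columns whose SHAP importance is zero under their original names, so they can be excluded from transform output. A condensed sketch of that bookkeeping (stand-in metadata class, simplified SHAP lookup):

from dataclasses import dataclass
from typing import Dict, List

import pandas as pd


@dataclass
class FeatureMeta:  # stand-in for the real metadata class
    name: str
    shap_value: float


def zero_shap_client_features(
    features_meta: List[FeatureMeta],
    df: pd.DataFrame,
    original_names: Dict[str, str],
    select_features: bool,
) -> List[str]:
    dropped: List[str] = []
    for fm in features_meta:
        original_name = original_names.get(fm.name, fm.name)
        is_client_feature = original_name in df.columns  # column exists in the user's X
        if fm.shap_value == 0.0 and is_client_feature and select_features:
            dropped.append(original_name)
    return dropped


df = pd.DataFrame(columns=["age", "city"])
meta = [FeatureMeta("age", 0.0), FeatureMeta("external_f", 0.0)]
print(zero_shap_client_features(meta, df, {}, True))  # ['age'] only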
upgini/metrics.py
CHANGED
@@ -807,14 +807,16 @@ class CatBoostWrapper(EstimatorWrapper):
         try:
             from catboost import Pool

+            cat_features = None
             if cat_encoder is not None:
                 if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
                     encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
-                    cat_features = None
                 else:
                     encoded = cat_encoder.transform(x[self.cat_features])
                     cat_features = encoded.columns.to_list()
                 x[self.cat_features] = encoded
+            else:
+                cat_features = self.cat_features

             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
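The fix gives `cat_features` a single initialization before the branches: previously it was only assigned inside the `cat_encoder is not None` branches, so the encoder-less path could reach the Pool construction with the variable unbound. A simplified sketch of the corrected control flow, with the encoder and CV type reduced to booleans:

from typing import List, Optional


def resolve_cat_features(
    has_encoder: bool, time_series_cv: bool, own_cat_features: List[str]
) -> Optional[List[str]]:
    cat_features = None  # single default for every path
    if has_encoder:
        if time_series_cv:
            pass  # integer-encoded for time-series CV: no cat_features for the Pool
        else:
            cat_features = own_cat_features  # encoded columns keep their names
    else:
        cat_features = own_cat_features  # the branch added in this release
    return cat_features


print(resolve_cat_features(has_encoder=False, time_series_cv=False, own_cat_features=["city"]))
# ['city'] instead of an unbound variable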
upgini/resource_bundle/strings.properties
CHANGED
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argument
 invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
 no_important_features_for_transform=There are no important features for transform. Return input as transformed
 search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

 # Validation errors
 # params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
 dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
-dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\
+dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
upgini/utils/deduplicate_utils.py
CHANGED
@@ -192,7 +192,7 @@ def clean_full_duplicates(
         unique_columns.remove(TARGET)
         marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()
+            dups_indices = df[marked_duplicates].index.to_list()[:100]
             nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
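The `[:100]` cap keeps the row-index sample interpolated into the updated dataset_diff_target_duplicates message from growing with the dataset. A runnable sketch of the capped sampling:

import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2, 2, 3], "target": [0, 1, 0, 1, 0]})
# Rows that share all non-target columns but may disagree on the target
marked_duplicates = df.duplicated(subset=["key"], keep=False)
if marked_duplicates.sum() > 0:
    dups_indices = df[marked_duplicates].index.to_list()[:100]  # sample at most 100 indexes
    print(dups_indices)  # [0, 1, 2, 3]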
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256
+upgini/__about__.py,sha256=wcphyJpGJs2mZPWvsK3omRtXm2Q4NsYXyO0X5zcwLMw,28
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
 upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
 upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -52,7 +52,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
 upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.87.
-upgini-1.2.87.
-upgini-1.2.87.
-upgini-1.2.87.
+upgini-1.2.87.dev5.dist-info/METADATA,sha256=Jdb6gn8ijXK4ccs5hC9yEPA6dQBzc5FtelPXOJgBfJA,49167
+upgini-1.2.87.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.87.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.87.dev5.dist-info/RECORD,,
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/WHEEL
File without changes
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/licenses/LICENSE
File without changes