upgini 1.2.87.dev3__py3-none-any.whl → 1.2.87.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +35 -8
- upgini/metrics.py +1 -1
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/deduplicate_utils.py +1 -1
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev4.dist-info}/METADATA +1 -1
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev4.dist-info}/RECORD +9 -9
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev4.dist-info}/WHEEL +0 -0
- {upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev4.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.87.dev3"
|
1
|
+
__version__ = "1.2.87.dev4"
|
upgini/features_enricher.py
CHANGED
@@ -2299,11 +2299,16 @@ if response.status_code == 200:
|
|
2299
2299
|
|
2300
2300
|
self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
|
2301
2301
|
|
2302
|
-
self.
|
2302
|
+
filtered_columns = self.__filtered_enriched_features(
|
2303
|
+
importance_threshold, max_features, trace_id, validated_X
|
2304
|
+
)
|
2305
|
+
# If there are no important features, return original dataframe
|
2306
|
+
if not filtered_columns:
|
2307
|
+
msg = self.bundle.get("no_important_features_for_transform")
|
2308
|
+
self.__log_warning(msg, show_support_link=True)
|
2309
|
+
return X, {c: c for c in X.columns}, [], dict()
|
2303
2310
|
|
2304
|
-
|
2305
|
-
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
2306
|
-
return X, {c: c for c in X.columns}, [], {}
|
2311
|
+
self.__validate_search_keys(self.search_keys, self.search_id)
|
2307
2312
|
|
2308
2313
|
if self._has_paid_features(exclude_features_sources):
|
2309
2314
|
msg = self.bundle.get("transform_with_paid_features")
|
@@ -2444,6 +2449,8 @@ if response.status_code == 200:
|
|
2444
2449
|
# Explode multiple search keys
|
2445
2450
|
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
2446
2451
|
|
2452
|
+
# Convert search keys and generate features on them
|
2453
|
+
|
2447
2454
|
email_column = self._get_email_column(search_keys)
|
2448
2455
|
hem_column = self._get_hem_column(search_keys)
|
2449
2456
|
if email_column:
|
@@ -2484,6 +2491,26 @@ if response.status_code == 200:
|
|
2484
2491
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
2485
2492
|
df = converter.convert(df)
|
2486
2493
|
|
2494
|
+
# TODO return X + generated features
|
2495
|
+
# external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
2496
|
+
# if not external_features:
|
2497
|
+
# # Unexplode dataframe back to original shape
|
2498
|
+
# if len(unnest_search_keys) > 0:
|
2499
|
+
# df = df.groupby(ENTITY_SYSTEM_RECORD_ID).first().reset_index()
|
2500
|
+
|
2501
|
+
# # Get important features from etalon source
|
2502
|
+
# etalon_features = [fm.name for fm in features_meta if fm.shap_value > 0 and fm.source == "etalon"]
|
2503
|
+
|
2504
|
+
# # Select only etalon features that exist in dataframe
|
2505
|
+
# available_etalon_features = [f for f in etalon_features if f in df.columns]
|
2506
|
+
|
2507
|
+
# # Return original dataframe with only important etalon features
|
2508
|
+
# result = df[available_etalon_features].copy()
|
2509
|
+
# result.index = validated_Xy.index
|
2510
|
+
|
2511
|
+
# return result, columns_renaming, generated_features, search_keys
|
2512
|
+
# ...
|
2513
|
+
|
2487
2514
|
meaning_types = {}
|
2488
2515
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2489
2516
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
@@ -2637,9 +2664,6 @@ if response.status_code == 200:
|
|
2637
2664
|
for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
|
2638
2665
|
if c not in self.dropped_client_feature_names_
|
2639
2666
|
]
|
2640
|
-
filtered_columns = self.__filtered_enriched_features(
|
2641
|
-
importance_threshold, max_features, trace_id, validated_X
|
2642
|
-
)
|
2643
2667
|
selecting_columns.extend(
|
2644
2668
|
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
2645
2669
|
)
|
@@ -2942,7 +2966,10 @@ if response.status_code == 200:
|
|
2942
2966
|
self.__log_warning(fintech_warning)
|
2943
2967
|
df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
2944
2968
|
if full_duplicates_warning:
|
2945
|
-
|
2969
|
+
if len(df) == 0:
|
2970
|
+
raise ValidationError(full_duplicates_warning)
|
2971
|
+
else:
|
2972
|
+
self.__log_warning(full_duplicates_warning)
|
2946
2973
|
|
2947
2974
|
# Explode multiple search keys
|
2948
2975
|
df = self.__add_fit_system_record_id(
|
upgini/metrics.py
CHANGED
@@ -807,10 +807,10 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
807
807
|
try:
|
808
808
|
from catboost import Pool
|
809
809
|
|
810
|
+
cat_features = None
|
810
811
|
if cat_encoder is not None:
|
811
812
|
if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
|
812
813
|
encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
|
813
|
-
cat_features = None
|
814
814
|
else:
|
815
815
|
encoded = cat_encoder.transform(x[self.cat_features])
|
816
816
|
cat_features = encoded.columns.to_list()
|
upgini/resource_bundle/strings.properties
CHANGED
@@ -156,7 +156,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
|
|
156
156
|
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
|
157
157
|
dataset_empty_column_names=Some column names are empty. Add names please
|
158
158
|
dataset_full_duplicates={:.5f}% of the rows are fully duplicated
|
159
|
-
dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\
|
159
|
+
dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
|
160
160
|
dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
161
161
|
dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
162
162
|
dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
upgini/utils/deduplicate_utils.py
CHANGED
@@ -192,7 +192,7 @@ def clean_full_duplicates(
|
|
192
192
|
unique_columns.remove(TARGET)
|
193
193
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
194
194
|
if marked_duplicates.sum() > 0:
|
195
|
-
dups_indices = df[marked_duplicates].index.to_list()
|
195
|
+
dups_indices = df[marked_duplicates].index.to_list()[:100]
|
196
196
|
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
197
197
|
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
198
198
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev4.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256
|
1
|
+
upgini/__about__.py,sha256=snYX5GSOXf809cKcpmiRzx30DuIAydReavaEB237z1A,28
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=8KJiPXTFSiJUl5hJPEhMwhpXqPnGm3LrX31pKwlYe3k,215900
|
7
7
|
upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=64M7RGbr9dItbXPYqWmeKhpBGHO4B69eV9Rj6P18_qg,45228
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=Q__3SNuespbG9bRJ9Gq4E_w665NPe8EZ7Pcng8B1V8Y,28001
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -52,7 +52,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
|
|
52
52
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
53
53
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
54
54
|
upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
|
55
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
55
|
+
upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
|
56
56
|
upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
|
57
57
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
58
58
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.87.
|
74
|
-
upgini-1.2.87.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
-
upgini-1.2.87.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
-
upgini-1.2.87.dev3.dist-info/RECORD,,
|
73
|
+
upgini-1.2.87.dev4.dist-info/METADATA,sha256=PpZ-d4CiDjy-RnXvTGmyEXh-Q_Mjkdf1UaGyVFniqCw,49167
|
74
|
+
upgini-1.2.87.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.87.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.87.dev4.dist-info/RECORD,,
|
File without changes
|
File without changes
|