upgini 1.2.118__py3-none-any.whl → 1.2.120__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +74 -48
- upgini/utils/display_utils.py +12 -9
- upgini/utils/sklearn_ext.py +3 -4
- {upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/METADATA +1 -1
- {upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/RECORD +8 -8
- {upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/WHEEL +0 -0
- {upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.118"
+__version__ = "1.2.120"
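For context, a quick way to confirm which build is installed after upgrading (a usage note, not part of the diff; the attribute comes from the `upgini/__about__.py` module shown above):

```python
# Prints the installed upgini version; after this upgrade it should read "1.2.120".
from upgini.__about__ import __version__

print(__version__)
```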
upgini/features_enricher.py
CHANGED
@@ -854,7 +854,7 @@ class FeaturesEnricher(TransformerMixin):
             raise e
         finally:
             self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
-
+
         return result

     def calculate_metrics(
@@ -1028,13 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
             columns_renaming,
             _,
         ) = prepared_data
-
-        # rename baseline_score_column
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        baseline_score_column = self.baseline_score_column
-        if baseline_score_column is not None:
-            baseline_score_column = reversed_renaming[baseline_score_column]
-
+
         gc.collect()

         if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1089,7 +1083,7 @@ class FeaturesEnricher(TransformerMixin):
                 has_time=has_time,
             )
             baseline_cv_result = baseline_estimator.cross_val_predict(
-                fitting_X, y_sorted, baseline_score_column
+                fitting_X, y_sorted, self.baseline_score_column
             )
             baseline_metric = baseline_cv_result.get_display_metric()
             if baseline_metric is None:
@@ -1192,7 +1186,7 @@ class FeaturesEnricher(TransformerMixin):
                     f"on client features: {eval_X_sorted.columns.to_list()}"
                 )
                 etalon_eval_results = baseline_estimator.calculate_metric(
-                    eval_X_sorted, eval_y_sorted, baseline_score_column
+                    eval_X_sorted, eval_y_sorted, self.baseline_score_column
                 )
                 etalon_eval_metric = etalon_eval_results.get_display_metric()
                 self.logger.info(
@@ -1741,7 +1735,8 @@ class FeaturesEnricher(TransformerMixin):
             c
             for c in (validated_X.columns.to_list() + generated_features)
             if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
-            and c not in (
+            and c
+            not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -2215,7 +2210,8 @@ class FeaturesEnricher(TransformerMixin):
                 progress_callback=progress_callback,
                 add_fit_system_record_id=True,
             )
-            if enriched_df is None:
+            if enriched_df is None or len(enriched_df) == 0 or len(enriched_df.columns) == 0:
+                self.logger.warning(f"Empty enriched dataframe returned: {enriched_df}, returning None")
                 return None

             x_columns = [
@@ -2500,6 +2496,9 @@ if response.status_code == 200:
     ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

         start_time = time.time()
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
@@ -2519,7 +2518,7 @@ if response.status_code == 200:
         if len(self.feature_names_) == 0:
             msg = self.bundle.get("no_important_features_for_transform")
             self.__log_warning(msg, show_support_link=True)
-            return
+            return None, {}, [], self.search_keys

         self.__validate_search_keys(self.search_keys, self.search_id)

@@ -2527,9 +2526,8 @@ if response.status_code == 200:
             msg = self.bundle.get("transform_with_paid_features")
             self.logger.warning(msg)
             self.__display_support_link(msg)
-            return None, {
+            return None, {}, [], self.search_keys

-        features_meta = self._search_task.get_all_features_metadata_v2()
         online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
         if len(online_api_features) > 0:
             self.logger.warning(
@@ -2550,7 +2548,7 @@ if response.status_code == 200:
                 self.logger.warning(msg)
                 print(msg)
                 show_request_quote_button()
-                return None, {
+                return None, {}, [], {}
             else:
                 msg = self.bundle.get("transform_usage_info").format(
                     transform_usage.limit, transform_usage.transformed_rows
@@ -2620,14 +2618,33 @@ if response.status_code == 200:

         # If there are no external features, we don't call backend on transform
         external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
-        if
+        if len(external_features) == 0:
             self.logger.warning(
                 "No external features found, returning original dataframe"
                 f" with generated important features: {self.feature_names_}"
             )
-
-
-
+            df = df.rename(columns=columns_renaming)
+            generated_features = [columns_renaming.get(c, c) for c in generated_features]
+            search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
+            selecting_columns = self._selecting_input_and_generated_columns(
+                validated_Xy, generated_features, keep_input, trace_id
+            )
+            self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
+            if add_fit_system_record_id:
+                df = self._add_fit_system_record_id(
+                    df,
+                    search_keys,
+                    SYSTEM_RECORD_ID,
+                    TARGET,
+                    columns_renaming,
+                    self.id_columns,
+                    self.cv,
+                    self.model_task_type,
+                    self.logger,
+                    self.bundle,
+                )
+                selecting_columns.append(SYSTEM_RECORD_ID)
+            return df[selecting_columns], columns_renaming, generated_features, search_keys

         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
@@ -2845,29 +2862,12 @@ if response.status_code == 200:
             how="left",
         )

-
-
-
-        selected_generated_features = [
-            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
-        ]
-        if keep_input is True:
-            selected_input_columns = [
-                c
-                for c in validated_Xy.columns
-                if not self.fit_select_features
-                or c in self.feature_names_
-                or c in new_columns_on_transform
-                or c in self.search_keys
-                or c in (self.id_columns or [])
-                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
-            ]
-        else:
-            selected_input_columns = []
-
-        selecting_columns = selected_input_columns + selected_generated_features
+        selecting_columns = self._selecting_input_and_generated_columns(
+            validated_Xy, generated_features, keep_input, trace_id
+        )
         selecting_columns.extend(
-            c
+            c
+            for c in result.columns
             if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
         )
         if add_fit_system_record_id:
@@ -2895,6 +2895,35 @@ if response.status_code == 200:

         return result, columns_renaming, generated_features, search_keys

+    def _selecting_input_and_generated_columns(
+        self,
+        validated_Xy: pd.DataFrame,
+        generated_features: list[str],
+        keep_input: bool,
+        trace_id: str,
+    ):
+        fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
+        new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
+
+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
+        ]
+        if keep_input is True:
+            selected_input_columns = [
+                c
+                for c in validated_Xy.columns
+                if not self.fit_select_features
+                or c in self.feature_names_
+                or c in new_columns_on_transform
+                or c in self.search_keys
+                or c in (self.id_columns or [])
+                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+            ]
+        else:
+            selected_input_columns = []
+
+        return selected_input_columns + selected_generated_features
+
     def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
             if search_id:
@@ -3349,6 +3378,7 @@ if response.status_code == 200:
         except KeyboardInterrupt as e:
             print(self.bundle.get("search_stopping"))
             self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
+            self._search_task = None
             self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
             print(self.bundle.get("search_stopped"))
             raise e
@@ -3727,9 +3757,7 @@ if response.status_code == 200:
         eval_types = validated_eval_X.dtypes
         # Find columns with different types
         diff_cols = [
-            (col, x_types[col], eval_types[col])
-            for col in x_types.index
-            if x_types[col] != eval_types[col]
+            (col, x_types[col], eval_types[col]) for col in x_types.index if x_types[col] != eval_types[col]
         ]
         diff_col_names = [col for col, _, _ in diff_cols]
         # print columns with different types
@@ -3815,9 +3843,7 @@ if response.status_code == 200:
         return Xy[X.columns].copy(), Xy[TARGET].copy()

     @staticmethod
-    def _sort_by_system_record_id(
-        X: pd.DataFrame, y: pd.Series, cv: CVType | None
-    ) -> tuple[pd.DataFrame, pd.Series]:
+    def _sort_by_system_record_id(X: pd.DataFrame, y: pd.Series, cv: CVType | None) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
             Xy = X.copy()
upgini/utils/display_utils.py
CHANGED
@@ -269,19 +269,22 @@ def make_html_report(
             if search_keys is not None
             else ""
         }
-        {
-
-
+        {
+            "<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
+            if metrics_df is not None
+            else ""
         }
-        {
-
-
+        {
+            "<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
+            if len(relevant_datasources_df) > 0
+            else ""
         }
         <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
         {make_table(relevant_features_df, wrap_long_string=25)}
-        {
-
-
+        {
+            "<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
+            if autofe_descriptions_df is not None and len(autofe_descriptions_df) > 0
+            else ""
         }
         <p>To buy found data sources, please contact: <a href='mailto:sales@upgini.com'>sales@upgini.com</a></p>
         <p>Best regards, </br><b>Upgini Team</b></p>
upgini/utils/sklearn_ext.py
CHANGED
@@ -1301,6 +1301,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
         encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
         encoder.fit(X_train[cat_features], y_train)

+        # OrdinalEncoder doesn't support progressive encoding with target
         X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
         X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

@@ -1314,10 +1315,8 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
         encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
         encoder.fit(X_train[cat_features], y_train)

-        #
-        X_train[cat_features] = encoder.transform(X_train[cat_features]
-
-        # Static encoding on validation (no y)
+        # OrdinalEncoder doesn't support progressive encoding with target
+        X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
         X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

         return X_train, y_train, X_test, y_test, [], encoder
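For context, a minimal sketch (plain scikit-learn, not code from this package; the column name is illustrative) of what the encoder configured above does when the same fitted mapping is applied statically to both the train and validation frames:

```python
# Minimal sketch: static ordinal encoding with unseen categories mapped to -1.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X_train = pd.DataFrame({"city": ["london", "paris", "london"]})
X_test = pd.DataFrame({"city": ["paris", "berlin"]})  # "berlin" is unseen at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(X_train[["city"]])

# The same fitted mapping is applied to both frames; no target is used at transform time.
X_train[["city"]] = encoder.transform(X_train[["city"]]).astype(int)
X_test[["city"]] = encoder.transform(X_test[["city"]]).astype(int)

print(X_train["city"].tolist())  # [0, 1, 0]
print(X_test["city"].tolist())   # [1, -1]  <- unseen category encoded as -1
```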
{upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=C4MPkUjPY8txHqkpCAHzv554Bvc9hUrOFMic1aakSTI,24
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
 upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
 upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -54,7 +54,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
 upgini/utils/deduplicate_utils.py,sha256=oZEiZeN-A92zwAPysV4OP9hO-niC2RLt-Dhc_hynBTU,11273
-upgini/utils/display_utils.py,sha256=
+upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc,11973
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
 upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
@@ -68,13 +68,13 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
 upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
-upgini/utils/sklearn_ext.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
 upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
+upgini-1.2.120.dist-info/METADATA,sha256=KFxeOoYvqFTE347dhf5EmvIskXqWMZvxYWy3AAwOyWI,50743
+upgini-1.2.120.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.120.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.120.dist-info/RECORD,,
{upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/WHEEL
File without changes
{upgini-1.2.118.dist-info → upgini-1.2.120.dist-info}/licenses/LICENSE
File without changes