upgini 1.2.65a3818.dev8__py3-none-any.whl → 1.2.66__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +11 -8
- upgini/utils/feature_info.py +17 -9
- upgini/utils/sort.py +8 -2
- {upgini-1.2.65a3818.dev8.dist-info → upgini-1.2.66.dist-info}/METADATA +1 -1
- {upgini-1.2.65a3818.dev8.dist-info → upgini-1.2.66.dist-info}/RECORD +8 -8
- {upgini-1.2.65a3818.dev8.dist-info → upgini-1.2.66.dist-info}/WHEEL +1 -1
- {upgini-1.2.65a3818.dev8.dist-info → upgini-1.2.66.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.65a3818.dev8"
|
|
1
|
+
__version__ = "1.2.66"
|
upgini/features_enricher.py
CHANGED
|
@@ -308,7 +308,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
308
308
|
self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
|
|
309
309
|
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
310
310
|
x_columns = [c.originalName or c.name for c in file_metadata.columns]
|
|
311
|
-
|
|
311
|
+
df = pd.DataFrame(columns=x_columns)
|
|
312
|
+
self.__prepare_feature_importances(trace_id, df, silent=True)
|
|
312
313
|
# TODO validate search_keys with search_keys from file_metadata
|
|
313
314
|
print(self.bundle.get("search_by_task_id_finish"))
|
|
314
315
|
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
|
@@ -1087,7 +1088,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1087
1088
|
enriched_shaps = enriched_cv_result.shap_values
|
|
1088
1089
|
|
|
1089
1090
|
if enriched_shaps is not None:
|
|
1090
|
-
self._update_shap_values(trace_id,
|
|
1091
|
+
self._update_shap_values(trace_id, fitting_X, enriched_shaps)
|
|
1091
1092
|
|
|
1092
1093
|
if enriched_metric is None:
|
|
1093
1094
|
self.logger.warning(
|
|
@@ -1255,14 +1256,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1255
1256
|
finally:
|
|
1256
1257
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1257
1258
|
|
|
1258
|
-
def _update_shap_values(self, trace_id: str,
|
|
1259
|
+
def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
|
|
1259
1260
|
renaming = self.fit_columns_renaming or {}
|
|
1260
1261
|
new_shaps = {
|
|
1261
1262
|
renaming.get(feature, feature): _round_shap_value(shap)
|
|
1262
1263
|
for feature, shap in new_shaps.items()
|
|
1263
1264
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1264
1265
|
}
|
|
1265
|
-
self.__prepare_feature_importances(trace_id,
|
|
1266
|
+
self.__prepare_feature_importances(trace_id, df, new_shaps)
|
|
1266
1267
|
|
|
1267
1268
|
if self.features_info_display_handle is not None:
|
|
1268
1269
|
try:
|
|
@@ -3021,7 +3022,7 @@ if response.status_code == 200:
|
|
|
3021
3022
|
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
3022
3023
|
self.__log_warning(msg)
|
|
3023
3024
|
|
|
3024
|
-
self.__prepare_feature_importances(trace_id,
|
|
3025
|
+
self.__prepare_feature_importances(trace_id, df)
|
|
3025
3026
|
|
|
3026
3027
|
self.__show_selected_features(self.fit_search_keys)
|
|
3027
3028
|
|
|
@@ -3796,7 +3797,7 @@ if response.status_code == 200:
|
|
|
3796
3797
|
return result_train, result_eval_sets
|
|
3797
3798
|
|
|
3798
3799
|
def __prepare_feature_importances(
|
|
3799
|
-
self, trace_id: str,
|
|
3800
|
+
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
|
3800
3801
|
):
|
|
3801
3802
|
if self._search_task is None:
|
|
3802
3803
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -3807,6 +3808,8 @@ if response.status_code == 200:
|
|
|
3807
3808
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
|
3808
3809
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3809
3810
|
|
|
3811
|
+
df = df.rename(columns=original_names_dict)
|
|
3812
|
+
|
|
3810
3813
|
self.feature_names_ = []
|
|
3811
3814
|
self.dropped_client_feature_names_ = []
|
|
3812
3815
|
self.feature_importances_ = []
|
|
@@ -3825,7 +3828,7 @@ if response.status_code == 200:
|
|
|
3825
3828
|
if feature_meta.name in original_names_dict.keys():
|
|
3826
3829
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3827
3830
|
|
|
3828
|
-
is_client_feature = feature_meta.name in
|
|
3831
|
+
is_client_feature = feature_meta.name in df.columns
|
|
3829
3832
|
|
|
3830
3833
|
# TODO make a decision about selected features based on special flag from mlb
|
|
3831
3834
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
|
@@ -3845,7 +3848,7 @@ if response.status_code == 200:
|
|
|
3845
3848
|
self.feature_names_.append(feature_meta.name)
|
|
3846
3849
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3847
3850
|
|
|
3848
|
-
df_for_sample = features_df if feature_meta.name in features_df.columns else
|
|
3851
|
+
df_for_sample = features_df if feature_meta.name in features_df.columns else df
|
|
3849
3852
|
feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
|
|
3850
3853
|
features_info.append(feature_info.to_row(self.bundle))
|
|
3851
3854
|
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
upgini/utils/feature_info.py
CHANGED
|
@@ -123,7 +123,11 @@ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) ->
|
|
|
123
123
|
|
|
124
124
|
|
|
125
125
|
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
126
|
-
|
|
126
|
+
providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
127
|
+
if providers:
|
|
128
|
+
return ", ".join(providers)
|
|
129
|
+
else:
|
|
130
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
127
131
|
|
|
128
132
|
|
|
129
133
|
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
@@ -137,13 +141,17 @@ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> st
|
|
|
137
141
|
|
|
138
142
|
|
|
139
143
|
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
144
|
+
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
145
|
+
if sources:
|
|
146
|
+
return ", ".join(sources)
|
|
147
|
+
else:
|
|
148
|
+
return feature_meta.data_source or (
|
|
149
|
+
LLM_SOURCE
|
|
150
|
+
if not feature_meta.name.endswith("_country")
|
|
151
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
152
|
+
and not is_client_feature
|
|
153
|
+
else ""
|
|
154
|
+
)
|
|
147
155
|
|
|
148
156
|
|
|
149
157
|
def _list_or_single(lst: List[str], single: str):
|
|
@@ -161,7 +169,7 @@ def _to_anchor(link: str, value: str) -> str:
|
|
|
161
169
|
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
162
170
|
|
|
163
171
|
|
|
164
|
-
def _make_links(names: List[str], links: List[str]):
|
|
172
|
+
def _make_links(names: List[str], links: List[str]) -> str:
|
|
165
173
|
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
166
174
|
return ",".join(all_links)
|
|
167
175
|
|
upgini/utils/sort.py
CHANGED
|
@@ -49,7 +49,7 @@ def sort_columns(
|
|
|
49
49
|
target = target_column if isinstance(target_column, pd.Series) else df[target_column]
|
|
50
50
|
target = prepare_target(target, model_task_type)
|
|
51
51
|
sort_dict = get_sort_columns_dict(
|
|
52
|
-
df[sorted_keys + other_columns], target, sorted_keys,
|
|
52
|
+
df[sorted_keys + other_columns], target, sorted_keys, sort_all_columns=sort_all_columns
|
|
53
53
|
)
|
|
54
54
|
other_columns = [c for c in other_columns if c in sort_dict]
|
|
55
55
|
columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
|
|
@@ -60,7 +60,6 @@ def get_sort_columns_dict(
|
|
|
60
60
|
df: pd.DataFrame,
|
|
61
61
|
target: pd.Series,
|
|
62
62
|
sorted_keys: List[str],
|
|
63
|
-
omit_nan: bool,
|
|
64
63
|
n_jobs: Optional[int] = None,
|
|
65
64
|
sort_all_columns: bool = False,
|
|
66
65
|
) -> Dict[str, Any]:
|
|
@@ -78,6 +77,13 @@ def get_sort_columns_dict(
|
|
|
78
77
|
return {}
|
|
79
78
|
|
|
80
79
|
df = df[columns_for_sort]
|
|
80
|
+
df_with_target = pd.concat([df, target], axis=1)
|
|
81
|
+
# Drop rows where target is NaN
|
|
82
|
+
df_with_target = df_with_target.loc[~target.isna()]
|
|
83
|
+
df = df_with_target.iloc[:, :-1]
|
|
84
|
+
target = df_with_target.iloc[:, -1]
|
|
85
|
+
df = df.fillna(df.mean())
|
|
86
|
+
omit_nan = False
|
|
81
87
|
hashes = [hash_series(df[col]) for col in columns_for_sort]
|
|
82
88
|
df = np.asarray(df, dtype=np.float32)
|
|
83
89
|
correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=RKwg9WzPF8ERs3C4gJn8-Lc6A63EciDAbvbVv58AqFY,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=YXG5uUBN1Qo-3X5EUV4Y--Pyqbvg4Gta3WIoWQMTYkU,205359
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -56,7 +56,7 @@ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuM
|
|
|
56
56
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
57
57
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
58
58
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
59
|
-
upgini/utils/feature_info.py,sha256=
|
|
59
|
+
upgini/utils/feature_info.py,sha256=rr96FwwPabkR1Q4O7QMu9oOGVWnXkvj44MfeSPjolyw,7105
|
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
62
62
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
@@ -65,12 +65,12 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
67
67
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
68
|
-
upgini/utils/sort.py,sha256=
|
|
68
|
+
upgini/utils/sort.py,sha256=VDXgZObIVAuGzXlAEejlKCNQcHmN5pN2bMou58sDKFI,6729
|
|
69
69
|
upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
73
|
-
upgini-1.2.
|
|
74
|
-
upgini-1.2.
|
|
75
|
-
upgini-1.2.65a3818.dev8.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
-
upgini-1.2.65a3818.dev8.dist-info/RECORD,,
|
|
73
|
+
upgini-1.2.66.dist-info/METADATA,sha256=XE_9NydkNtrWbOERCEFaQD_dOo5aPYt0URD0xMoS7kg,49113
|
|
74
|
+
upgini-1.2.66.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
75
|
+
upgini-1.2.66.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
+
upgini-1.2.66.dist-info/RECORD,,
|
|
File without changes
|