upgini 1.1.315a1__py3-none-any.whl → 1.1.316__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -1
- upgini/autofe/unary.py +3 -0
- upgini/data_source/data_source_publisher.py +9 -0
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +42 -24
- upgini/utils/datetime_utils.py +0 -1
- {upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/METADATA +1 -1
- {upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/RECORD +11 -11
- {upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/WHEEL +0 -0
- {upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.315a1"
|
|
1
|
+
__version__ = "1.1.316"
|
upgini/autofe/binary.py
CHANGED
|
@@ -141,7 +141,7 @@ class Distance(PandasOperand):
|
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
|
-
1 - self.__dot(left, right) / (self.
|
|
144
|
+
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
145
145
|
)
|
|
146
146
|
|
|
147
147
|
# row-wise dot product
|
|
@@ -152,6 +152,9 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
+
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
156
|
+
return np.sqrt(self.__dot(vector, vector))
|
|
157
|
+
|
|
155
158
|
|
|
156
159
|
# Left for backward compatibility
|
|
157
160
|
class Sim(Distance):
|
upgini/autofe/unary.py
CHANGED
|
@@ -121,6 +121,9 @@ class Norm(PandasOperand):
|
|
|
121
121
|
|
|
122
122
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
123
123
|
data_dropna = data.dropna()
|
|
124
|
+
if data_dropna.empty:
|
|
125
|
+
return data
|
|
126
|
+
|
|
124
127
|
normalized_data = Normalizer().transform(data_dropna.to_frame().T).T
|
|
125
128
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
129
|
normalized_data = normalized_data.reindex(data.index)
|
|
upgini/data_source/data_source_publisher.py
CHANGED
|
@@ -63,6 +63,7 @@ class DataSourcePublisher:
|
|
|
63
63
|
keep_features: Optional[List[str]] = None,
|
|
64
64
|
date_features: Optional[List[str]] = None,
|
|
65
65
|
date_vector_features: Optional[List[str]] = None,
|
|
66
|
+
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
66
67
|
_force_generation=False,
|
|
67
68
|
_silent=False,
|
|
68
69
|
) -> str:
|
|
@@ -163,6 +164,8 @@ class DataSourcePublisher:
|
|
|
163
164
|
if date_format is None:
|
|
164
165
|
raise ValidationError("date_format should be presented if you use date vector features")
|
|
165
166
|
request["dateVectorFeatures"] = date_vector_features
|
|
167
|
+
if generate_runtime_embeddings is not None:
|
|
168
|
+
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
166
169
|
self.logger.info(f"Start registering data table {request}")
|
|
167
170
|
|
|
168
171
|
task_id = self._rest_client.register_ads(request, trace_id)
|
|
@@ -276,6 +279,8 @@ class DataSourcePublisher:
|
|
|
276
279
|
client_emails: Optional[List[str]] = None,
|
|
277
280
|
date_features: Optional[List[str]] = None,
|
|
278
281
|
date_vector_features: Optional[List[str]] = None,
|
|
282
|
+
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
283
|
+
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
279
284
|
):
|
|
280
285
|
trace_id = str(uuid.uuid4())
|
|
281
286
|
with MDC(trace_id=trace_id):
|
|
@@ -327,6 +332,10 @@ class DataSourcePublisher:
|
|
|
327
332
|
request["dateFeatures"] = date_features
|
|
328
333
|
if date_vector_features is not None:
|
|
329
334
|
request["dateVectorFeatures"] = date_vector_features
|
|
335
|
+
if exclude_from_autofe_generation is not None:
|
|
336
|
+
request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
|
|
337
|
+
if generate_runtime_embeddings is not None:
|
|
338
|
+
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
330
339
|
self.logger.info(f"Activating data tables with request {request}")
|
|
331
340
|
|
|
332
341
|
self._rest_client.activate_datatables(request, trace_id)
|
upgini/dataset.py
CHANGED
|
@@ -692,7 +692,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
692
692
|
parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
|
|
693
693
|
self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
|
|
694
694
|
uploading_file_size = Path(parquet_file_path).stat().st_size
|
|
695
|
-
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
|
|
695
|
+
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
|
|
696
696
|
if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
|
|
697
697
|
raise ValidationError(self.bundle.get("dataset_too_big_file"))
|
|
698
698
|
return parquet_file_path
|
upgini/features_enricher.py
CHANGED
|
@@ -846,17 +846,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
846
846
|
self.logger.warning(msg)
|
|
847
847
|
print(msg)
|
|
848
848
|
|
|
849
|
+
if X is not None and y is None:
|
|
850
|
+
raise ValidationError("X passed without y")
|
|
851
|
+
|
|
849
852
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
850
853
|
effective_X = X if X is not None else self.X
|
|
851
854
|
effective_y = y if y is not None else self.y
|
|
852
855
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
853
856
|
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
854
857
|
|
|
858
|
+
if (
|
|
859
|
+
self._search_task is None
|
|
860
|
+
or self._search_task.provider_metadata_v2 is None
|
|
861
|
+
or len(self._search_task.provider_metadata_v2) == 0
|
|
862
|
+
or effective_X is None
|
|
863
|
+
or effective_y is None
|
|
864
|
+
):
|
|
865
|
+
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
866
|
+
|
|
867
|
+
validated_X = self._validate_X(effective_X)
|
|
868
|
+
validated_y = self._validate_y(validated_X, effective_y)
|
|
869
|
+
validated_eval_set = (
|
|
870
|
+
[self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
|
|
871
|
+
if effective_eval_set is not None
|
|
872
|
+
else None
|
|
873
|
+
)
|
|
874
|
+
|
|
855
875
|
try:
|
|
856
876
|
self.__log_debug_information(
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
877
|
+
validated_X,
|
|
878
|
+
validated_y,
|
|
879
|
+
validated_eval_set,
|
|
860
880
|
exclude_features_sources=exclude_features_sources,
|
|
861
881
|
cv=cv if cv is not None else self.cv,
|
|
862
882
|
importance_threshold=importance_threshold,
|
|
@@ -866,21 +886,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
866
886
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
|
867
887
|
)
|
|
868
888
|
|
|
869
|
-
if (
|
|
870
|
-
self._search_task is None
|
|
871
|
-
or self._search_task.provider_metadata_v2 is None
|
|
872
|
-
or len(self._search_task.provider_metadata_v2) == 0
|
|
873
|
-
or effective_X is None
|
|
874
|
-
or effective_y is None
|
|
875
|
-
):
|
|
876
|
-
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
877
|
-
|
|
878
|
-
if X is not None and y is None:
|
|
879
|
-
raise ValidationError("X passed without y")
|
|
880
|
-
|
|
881
889
|
validate_scoring_argument(scoring)
|
|
882
890
|
|
|
883
|
-
self._validate_baseline_score(
|
|
891
|
+
self._validate_baseline_score(validated_X, validated_eval_set)
|
|
884
892
|
|
|
885
893
|
if self._has_paid_features(exclude_features_sources):
|
|
886
894
|
msg = self.bundle.get("metrics_with_paid_features")
|
|
@@ -889,7 +897,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
889
897
|
return None
|
|
890
898
|
|
|
891
899
|
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
892
|
-
estimator,
|
|
900
|
+
estimator, validated_X, self.search_keys
|
|
893
901
|
)
|
|
894
902
|
|
|
895
903
|
prepared_data = self._prepare_data_for_metrics(
|
|
@@ -1034,10 +1042,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1034
1042
|
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1035
1043
|
}
|
|
1036
1044
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1037
|
-
|
|
1045
|
+
validated_y
|
|
1038
1046
|
):
|
|
1039
1047
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1040
|
-
np.mean(
|
|
1048
|
+
np.mean(validated_y), 4
|
|
1041
1049
|
)
|
|
1042
1050
|
if etalon_metric is not None:
|
|
1043
1051
|
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
@@ -1107,10 +1115,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1107
1115
|
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1108
1116
|
}
|
|
1109
1117
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1110
|
-
|
|
1118
|
+
validated_eval_set[idx][1]
|
|
1111
1119
|
):
|
|
1112
1120
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1113
|
-
np.mean(
|
|
1121
|
+
np.mean(validated_eval_set[idx][1]), 4
|
|
1114
1122
|
)
|
|
1115
1123
|
if etalon_eval_metric is not None:
|
|
1116
1124
|
eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
|
@@ -3158,6 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3158
3166
|
if len(search_key_names_by_type) == 0:
|
|
3159
3167
|
return df, {}
|
|
3160
3168
|
|
|
3169
|
+
self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
|
|
3161
3170
|
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3162
3171
|
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3163
3172
|
exploded_dfs = []
|
|
@@ -3176,6 +3185,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3176
3185
|
columns_renaming[new_search_key] = new_search_key
|
|
3177
3186
|
|
|
3178
3187
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3188
|
+
self.logger.info(f"Finished explosion. Size after: {len(df)}")
|
|
3179
3189
|
return df, unnest_search_keys
|
|
3180
3190
|
|
|
3181
3191
|
def __add_fit_system_record_id(
|
|
@@ -3209,18 +3219,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3209
3219
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3210
3220
|
sort_columns = [date_column] if date_column is not None else []
|
|
3211
3221
|
|
|
3222
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3223
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3224
|
+
|
|
3212
3225
|
other_columns = sorted(
|
|
3213
3226
|
[
|
|
3214
3227
|
c
|
|
3215
3228
|
for c in df.columns
|
|
3216
|
-
if c not in sort_columns
|
|
3229
|
+
if c not in sort_columns
|
|
3230
|
+
and c not in sorted_other_keys
|
|
3231
|
+
and c not in sort_exclude_columns
|
|
3232
|
+
and df[c].nunique() > 1
|
|
3217
3233
|
]
|
|
3218
3234
|
)
|
|
3219
3235
|
|
|
3236
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3237
|
+
|
|
3220
3238
|
search_keys_hash = "search_keys_hash"
|
|
3221
|
-
if len(
|
|
3239
|
+
if len(all_other_columns) > 0:
|
|
3222
3240
|
sort_columns.append(search_keys_hash)
|
|
3223
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
3241
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3224
3242
|
|
|
3225
3243
|
df = df.sort_values(by=sort_columns)
|
|
3226
3244
|
|
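upgini/utils/datetime_utils.py
CHANGED
|
(the one-line removal in this file was not captured in this rendering)
|
{upgini-1.1.315a1.dist-info → upgini-1.1.316.dist-info}/RECORD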
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=DQCLPSfZIiyKQ88S6JJcAEA3dURvJk2NhtYNJeB5Mq8,24
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=Gu4gsnMVjcsfWnJlu4Np3jpE9Au1UywhuHQb0Xv5YNg,187982
|
|
7
7
|
upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
9
|
upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
|
|
@@ -15,15 +15,15 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
|
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=2Z5FrfdCtesKEHBuabEBiRvwOAzcRoFKAX1wvGpHL0I,7003
|
|
19
19
|
upgini/autofe/date.py,sha256=AO3P8GtUHD6vPE_1Vrj3nsnXYBxiXe7vun6aLHReZgQ,9064
|
|
20
20
|
upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
22
22
|
upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=oIMf-IVy7L7GkzxMmQyExX0tOH9RhWeQh7cGxxMDiPk,3832
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=Vg0biG86YB0OEaoxbK9YYrr4yARm11_h3bTWIBgoScA,22115
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
42
42
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=niZcf2YqAwokUFUW474zajlzv9HAMf7nv9v_WPJHpyc,12123
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.316.dist-info/METADATA,sha256=12UKpdX0d9nky8XWhKtyQjDK2MVWtbsEr811NSWrKmE,48222
|
|
61
|
+
upgini-1.1.316.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.1.316.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.316.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|