upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -9,37 +9,34 @@ search_stopped=Search request stopped
|
|
|
9
9
|
polling_search_task=\nRunning search request, search_id={}
|
|
10
10
|
polling_unregister_information=We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
|
|
11
11
|
ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
|
|
12
|
-
demo_dataset_info=Demo training dataset detected. Registration for an API key is not required
|
|
12
|
+
demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
|
|
13
13
|
transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
|
|
14
14
|
transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
|
|
15
15
|
|
|
16
16
|
# Warnings
|
|
17
17
|
support_link=https://upgini.com/support
|
|
18
|
-
|
|
19
|
-
# slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
|
|
20
|
-
support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
|
|
18
|
+
support_text=Looks like you've run into an error. For help request write us in support
|
|
21
19
|
slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
|
|
22
20
|
slack_community_alt=Upgini Slack community
|
|
23
|
-
version_warning
|
|
24
|
-
unregistered_with_personal_keys
|
|
25
|
-
date_only_search
|
|
26
|
-
date_search_without_time_series
|
|
27
|
-
metrics_exclude_paid_features
|
|
28
|
-
metrics_no_important_free_features
|
|
29
|
-
metrics_no_important_features
|
|
21
|
+
version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
|
|
22
|
+
unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com. It will be ignored
|
|
23
|
+
date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
|
|
24
|
+
date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
25
|
+
metrics_exclude_paid_features=Metrics calculated after enrichment use free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact the support team:
|
|
26
|
+
metrics_no_important_free_features=No important free features to calculate metrics
|
|
27
|
+
metrics_no_important_features=No important features to calculate metrics
|
|
30
28
|
metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
31
29
|
# metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
32
|
-
# transform_with_trial_features
|
|
30
|
+
# transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
|
|
33
31
|
# Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
34
|
-
metrics_with_paid_features
|
|
35
|
-
transform_with_paid_features
|
|
36
|
-
trial_quota_limit_riched
|
|
37
|
-
loss_selection_warn
|
|
38
|
-
loss_calc_metrics_warn
|
|
39
|
-
multivariate_timeseries_detected
|
|
40
|
-
group_k_fold_in_classification
|
|
41
|
-
current_date_added
|
|
42
|
-
|
|
32
|
+
metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
|
|
33
|
+
transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
|
|
34
|
+
trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
|
|
35
|
+
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
|
+
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
|
+
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
+
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
39
|
+
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
43
40
|
# Errors
|
|
44
41
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
45
42
|
metrics_unfitted_enricher=Call fit method before calling calculate_metrics
|
|
@@ -81,24 +78,26 @@ date_and_datetime_simultanious=DATE and DATETIME search keys cannot be used simu
|
|
|
81
78
|
email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneously. Choose one to keep
|
|
82
79
|
postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
|
|
83
80
|
multiple_search_key=Search key {} passed multiple times
|
|
84
|
-
unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
|
|
81
|
+
unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
|
|
85
82
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
86
83
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
87
84
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
88
|
-
|
|
85
|
+
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
|
|
86
|
+
search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaneously. Parameter country_code will be ignored
|
|
89
87
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
|
-
single_constant_search_key
|
|
91
|
-
|
|
88
|
+
single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
89
|
+
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
90
|
+
unsupported_index_column=Your column named `index` was dropped because this name is reserved for system needs.
|
|
92
91
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
92
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
94
93
|
unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
|
|
95
94
|
invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
|
|
96
95
|
invalid_country=All values of COUNTRY column `{}` are invalid
|
|
97
|
-
invalid_ip=All values of
|
|
96
|
+
invalid_ip=All values of IP column `{}` are invalid
|
|
98
97
|
# X and y validation
|
|
99
98
|
unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
|
|
100
99
|
x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
|
|
101
|
-
x_contains_enriching_columns
|
|
100
|
+
x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
|
|
102
101
|
unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
|
|
103
102
|
y_is_constant=y is a constant. Relevant feature search requires a non-constant y
|
|
104
103
|
x_and_y_diff_size=X and y has different size: {}, {}.
|
|
@@ -111,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
|
|
|
111
110
|
x_is_empty=X is empty
|
|
112
111
|
y_is_empty=y is empty
|
|
113
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
114
|
-
missing_generate_feature
|
|
115
|
-
x_unstable_by_date
|
|
116
|
-
train_unstable_target
|
|
117
|
-
eval_unstable_target
|
|
113
|
+
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
115
|
+
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
116
|
+
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
118
117
|
# eval set validation
|
|
119
118
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
120
119
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -134,24 +133,27 @@ eval_y_is_empty=y in eval_set is empty.
|
|
|
134
133
|
x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
135
134
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
136
135
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and retry
|
|
136
|
+
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
|
137
137
|
# target validation
|
|
138
138
|
empty_target=Target is empty in all rows
|
|
139
139
|
# non_numeric_target=Binary target should be numerical type
|
|
140
|
-
uneven_eval_target_distribution
|
|
141
|
-
target_outliers_warning
|
|
140
|
+
uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
|
|
141
|
+
target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
|
|
142
|
+
|
|
142
143
|
# features validation
|
|
143
|
-
empty_or_contant_features
|
|
144
|
-
high_cardinality_features
|
|
145
|
-
# one_hot_encoded_features
|
|
144
|
+
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
|
145
|
+
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
|
146
|
+
# one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
|
147
|
+
|
|
146
148
|
# Dataset validation
|
|
147
149
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
|
148
150
|
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
|
|
149
151
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
150
|
-
dataset_full_duplicates
|
|
151
|
-
dataset_diff_target_duplicates
|
|
152
|
-
dataset_train_diff_target_duplicates_fintech
|
|
153
|
-
dataset_eval_diff_target_duplicates_fintech
|
|
154
|
-
dataset_drop_old_dates
|
|
152
|
+
dataset_full_duplicates={:.5f}% of the rows are fully duplicated
|
|
153
|
+
dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
154
|
+
dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
155
|
+
dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
156
|
+
dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
155
157
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
156
158
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
157
159
|
dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
|
|
@@ -160,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
|
|
|
160
162
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
161
163
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
162
164
|
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
163
|
-
dataset_rarest_class_less_threshold
|
|
164
|
-
dataset_date_features
|
|
165
|
+
dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
166
|
+
dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
165
167
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
166
168
|
dataset_constant_target=y contains only one distinct value
|
|
167
169
|
dataset_empty_target=y contains only NaN or incorrect values.
|
|
@@ -169,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
|
|
|
169
171
|
dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
|
|
170
172
|
dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
|
|
171
173
|
dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
|
|
172
|
-
binary_small_dataset
|
|
174
|
+
binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
|
|
173
175
|
all_search_keys_invalid=All search keys are invalid
|
|
174
|
-
all_emails_invalid
|
|
175
|
-
# Metrics validation
|
|
176
|
+
all_emails_invalid=All values in column {} are invalid emails
# Metrics validation
|
|
176
177
|
metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
|
|
177
178
|
metrics_unsupported_target_type=Unsupported type of target in y: {}
|
|
178
179
|
metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
|
|
@@ -188,10 +189,9 @@ ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase th
|
|
|
188
189
|
ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
|
|
189
190
|
ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample doesn't contain valid keys\nPlease fill the key columns with valid values and resubmit the data
|
|
190
191
|
# Features info warning
|
|
191
|
-
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats
|
|
192
|
+
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
192
193
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
193
|
-
features_not_generated
|
|
194
|
-
|
|
194
|
+
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
195
195
|
# Information
|
|
196
196
|
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
197
197
|
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
@@ -200,12 +200,19 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
|
|
|
200
200
|
email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
201
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
202
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
203
|
-
phone_detected_not_registered
|
|
204
|
-
target_type_detected=\nDetected task type: {}\n
|
|
203
|
+
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
205
|
+
binary_target_reason=only two unique label-values observed
|
|
206
|
+
non_numeric_multiclass_reason=non-numeric label values observed
|
|
207
|
+
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
208
|
+
date_search_key_regression_reason=date search key is present, treating as regression
|
|
209
|
+
many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
|
|
210
|
+
limited_int_multiclass_reason=integer-like values with limited unique values observed
|
|
205
211
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
206
212
|
all_ok_community_invite=❓ Support request
|
|
207
213
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
208
214
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
215
|
+
imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
209
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
210
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
211
218
|
|
|
@@ -244,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
|
|
|
244
251
|
relevant_data_sources_all_shap=All features SHAP
|
|
245
252
|
relevant_data_sources_number=Number of relevant features
|
|
246
253
|
|
|
254
|
+
# Autofe descriptions
|
|
255
|
+
autofe_descriptions_header=*Description of AutoFE feature names
|
|
256
|
+
autofe_descriptions_sources=Sources
|
|
257
|
+
autofe_descriptions_feature_name=Feature name
|
|
258
|
+
autofe_descriptions_feature=Feature {}
|
|
259
|
+
autofe_descriptions_function=Function
|
|
260
|
+
|
|
261
|
+
|
|
247
262
|
# Quality metrics table
|
|
248
263
|
quality_metrics_header=Accuracy after enrichment
|
|
249
264
|
quality_metrics_train_segment=Train
|
upgini/search_task.py
CHANGED
|
@@ -3,6 +3,7 @@ import tempfile
|
|
|
3
3
|
import time
|
|
4
4
|
from functools import lru_cache
|
|
5
5
|
from typing import Dict, List, Optional
|
|
6
|
+
import uuid
|
|
6
7
|
|
|
7
8
|
import pandas as pd
|
|
8
9
|
|
|
@@ -97,10 +98,7 @@ class SearchTask:
|
|
|
97
98
|
time.sleep(self.POLLING_DELAY_SECONDS)
|
|
98
99
|
except KeyboardInterrupt as e:
|
|
99
100
|
if not check_fit:
|
|
100
|
-
|
|
101
|
-
self.rest_client.stop_search_task_v2(trace_id, search_task_id)
|
|
102
|
-
self.logger.warning(f"Search {search_task_id} stopped by user")
|
|
103
|
-
print(bundle.get("search_stopped"))
|
|
101
|
+
self._stop(trace_id)
|
|
104
102
|
raise e
|
|
105
103
|
print()
|
|
106
104
|
|
|
@@ -133,6 +131,14 @@ class SearchTask:
|
|
|
133
131
|
|
|
134
132
|
return self
|
|
135
133
|
|
|
134
|
+
def _stop(self, trace_id: Optional[str] = None):
|
|
135
|
+
trace_id = trace_id or uuid.uuid4()
|
|
136
|
+
search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
|
|
137
|
+
print(bundle.get("search_stopping"))
|
|
138
|
+
self.rest_client.stop_search_task_v2(trace_id, search_task_id)
|
|
139
|
+
self.logger.warning(f"Search {search_task_id} stopped by user")
|
|
140
|
+
print(bundle.get("search_stopped"))
|
|
141
|
+
|
|
136
142
|
def get_all_features_metadata_v2(self) -> Optional[List[FeaturesMetadataV2]]:
|
|
137
143
|
if self.provider_metadata_v2 is None:
|
|
138
144
|
return None
|
|
upgini/utils/Roboto-Regular.ttf
Binary file
|
|
upgini/utils/base_search_key_detector.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
|
|
14
|
+
return [
|
|
15
|
+
column_name
|
|
16
|
+
for column_name in column_names
|
|
17
|
+
if self._is_search_key_by_name(column_name)
|
|
18
|
+
]
|
|
17
19
|
|
|
18
|
-
def
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
for column_name in df.columns:
|
|
20
|
+
def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
|
|
21
|
+
other_columns = [col for col in df.columns if col not in existing_search_keys]
|
|
22
|
+
columns_by_names = self._get_search_keys_by_name(other_columns)
|
|
23
|
+
columns_by_values = []
|
|
24
|
+
for column_name in other_columns:
|
|
24
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
25
|
-
|
|
26
|
+
columns_by_values.append(column_name)
|
|
27
|
+
return list(set(columns_by_names + columns_by_values))
|
upgini/utils/country_utils.py
CHANGED
|
@@ -4,6 +4,22 @@ from pandas.api.types import is_object_dtype, is_string_dtype
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
class CountrySearchKeyConverter:
|
|
8
|
+
|
|
9
|
+
def __init__(self, country_col: str):
|
|
10
|
+
self.country_col = country_col
|
|
11
|
+
|
|
12
|
+
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
13
|
+
df[self.country_col] = (
|
|
14
|
+
df[self.country_col]
|
|
15
|
+
.astype("string")
|
|
16
|
+
.str.upper()
|
|
17
|
+
.str.replace(r"[^A-Z]", "", regex=True)
|
|
18
|
+
.str.replace("UK", "GB", regex=False)
|
|
19
|
+
)
|
|
20
|
+
return df
|
|
21
|
+
|
|
22
|
+
|
|
7
23
|
class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
8
24
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
9
25
|
return "country" in str(column_name).lower()
|
|
upgini/utils/custom_loss_utils.py
CHANGED
|
@@ -11,46 +11,49 @@ def get_runtime_params_custom_loss(
|
|
|
11
11
|
runtime_parameters: RuntimeParameters,
|
|
12
12
|
logger: Optional[logging.Logger] = None,
|
|
13
13
|
) -> RuntimeParameters:
|
|
14
|
+
if not loss:
|
|
15
|
+
return runtime_parameters
|
|
16
|
+
|
|
14
17
|
if logger is None:
|
|
15
18
|
logger = logging.getLogger()
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
)
|
|
40
|
-
else False
|
|
19
|
+
|
|
20
|
+
selection_loss_reg = [
|
|
21
|
+
"regression",
|
|
22
|
+
"regression_l1",
|
|
23
|
+
"huber",
|
|
24
|
+
"poisson",
|
|
25
|
+
"quantile",
|
|
26
|
+
"mape",
|
|
27
|
+
"mean_absolute_percentage_error",
|
|
28
|
+
"gamma",
|
|
29
|
+
"tweedie",
|
|
30
|
+
]
|
|
31
|
+
selection_loss_binary = ["binary"]
|
|
32
|
+
selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
|
|
33
|
+
use_custom_loss = (
|
|
34
|
+
True
|
|
35
|
+
if (
|
|
36
|
+
(model_task_type == ModelTaskType.REGRESSION)
|
|
37
|
+
and (loss in selection_loss_reg)
|
|
38
|
+
or (model_task_type == ModelTaskType.BINARY)
|
|
39
|
+
and (loss in selection_loss_binary)
|
|
40
|
+
or (model_task_type == ModelTaskType.MULTICLASS)
|
|
41
|
+
and (loss in selection_loss_multi_clf)
|
|
41
42
|
)
|
|
43
|
+
else False
|
|
44
|
+
)
|
|
42
45
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
46
|
+
if use_custom_loss:
|
|
47
|
+
runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
|
|
48
|
+
runtime_parameters.properties["lightgbm_params_base.objective"] = loss
|
|
49
|
+
runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
|
|
50
|
+
msg = bundle.get("loss_selection_info").format(loss)
|
|
51
|
+
logger.info(msg)
|
|
52
|
+
print(msg)
|
|
53
|
+
else:
|
|
54
|
+
msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
|
|
55
|
+
logger.warning(msg)
|
|
56
|
+
print(msg)
|
|
54
57
|
|
|
55
58
|
return runtime_parameters
|
|
56
59
|
|