upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. See below for more details.

Files changed (43)
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
  39. upgini-1.2.31.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
@@ -9,37 +9,34 @@ search_stopped=Search request stopped
9
9
  polling_search_task=\nRunning search request, search_id={}
10
10
  polling_unregister_information=We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
11
11
  ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
12
- demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.
12
+ demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
13
13
  transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
14
14
  transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
15
15
 
16
16
  # Warnings
17
17
  support_link=https://upgini.com/support
18
- # slack_community_link=https://4mlg.short.gy/join-upgini-community
19
- # slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
20
- support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
18
+ support_text=Looks like you've run into an error. For help request write us in support
21
19
  slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
22
20
  slack_community_alt=Upgini Slack community
23
- version_warning=\nWARNING: Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
24
- unregistered_with_personal_keys=\nWARNING: Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
25
- date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
26
- date_search_without_time_series=\nWARNING: Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
27
- metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
28
- metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
29
- metrics_no_important_features=\nWARNING: No important features to calculate metrics
21
+ version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
22
+ unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
23
+ date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
24
+ date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
25
+ metrics_exclude_paid_features=Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
26
+ metrics_no_important_free_features=No important free features to calculate metrics
27
+ metrics_no_important_features=No important features to calculate metrics
30
28
  metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
31
29
  # metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
32
- # transform_with_trial_features=\nWARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
30
+ # transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
33
31
  # Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
34
- metrics_with_paid_features=\nWARNING: The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
35
- transform_with_paid_features=\nWARNING: Enriching with Paid data is not available.\nContact Upgini support for the data access
36
- trial_quota_limit_riched=\nWARNING: You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
37
- loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection with {1}
38
- loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
- multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
- group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
- current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
42
-
32
+ metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
33
+ transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
34
+ trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
35
+ loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
+ loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
+ multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
+ group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
39
+ current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
43
40
  # Errors
44
41
  failed_search_by_task_id=Failed to retrieve the specified search results
45
42
  metrics_unfitted_enricher=Call fit method before calling calculate_metrics
@@ -81,24 +78,26 @@ date_and_datetime_simultanious=DATE and DATETIME search keys cannot be used simu
81
78
  email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneously. Choose one to keep
82
79
  postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
83
80
  multiple_search_key=Search key {} passed multiple times
84
- unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
81
+ unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
85
82
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
86
83
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
87
84
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
- search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
85
+ unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
86
+ search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
87
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
- single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
- unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
88
+ single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
89
+ unsupported_multi_key=Search key {} cannot be used multiple times
90
+ unsupported_index_column=Your column with name `index` was dropped because it's reserved name is booked for system needs.
92
91
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
93
92
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
94
93
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
95
94
  invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
96
95
  invalid_country=All values of COUNTRY column `{}` are invalid
97
- invalid_ip=All values of IPv4 column `{}` are invalid
96
+ invalid_ip=All values of IP column `{}` are invalid
98
97
  # X and y validation
99
98
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
100
99
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
101
- x_contains_enriching_columns=\nWARNING: X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
100
+ x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
102
101
  unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
103
102
  y_is_constant=y is a constant. Relevant feature search requires a non-constant y
104
103
  x_and_y_diff_size=X and y has different size: {}, {}.
@@ -111,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
111
110
  x_is_empty=X is empty
112
111
  y_is_empty=y is empty
113
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
114
- missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
115
- x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
- train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
- eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
113
+ missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
+ train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
+ eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
118
117
  # eval set validation
119
118
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
120
119
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -134,24 +133,27 @@ eval_y_is_empty=y in eval_set is empty.
134
133
  x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
135
134
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
136
135
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
136
+ missing_features_for_transform=Missing some features for transform that were presented on fit: {}
137
137
  # target validation
138
138
  empty_target=Target is empty in all rows
139
139
  # non_numeric_target=Binary target should be numerical type
140
- uneven_eval_target_distribution=\nWARNING: y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
141
- target_outliers_warning=\nWARNING: We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
140
+ uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
141
+ target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
142
+
142
143
  # features validation
143
- empty_or_contant_features=\nWARNING: Columns {} has value with frequency more than 99%, removed from X
144
- high_cardinality_features=\nWARNING: Columns {} has high cardinality (>90% unique values), removed from X
145
- # one_hot_encoded_features=\nWARNING: One hot encoded features detected. Use int encoding for correct results of fit.\n{}
144
+ empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
145
+ high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
146
+ # one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
147
+
146
148
  # Dataset validation
147
149
  dataset_too_few_rows=X size should be at least {} rows after validation
148
150
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
149
151
  dataset_empty_column_names=Some column names are empty. Add names please
150
- dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
151
- dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
152
- dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
153
- dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
154
- dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
152
+ dataset_full_duplicates={:.5f}% of the rows are fully duplicated
153
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
154
+ dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
155
+ dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
156
+ dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
155
157
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
156
158
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
157
159
  dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
@@ -160,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
160
162
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
161
163
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
162
164
  dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
163
- dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
164
- dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
165
+ dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
166
+ dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
165
167
  dataset_too_many_features=Too many features. Maximum number of features is {}
166
168
  dataset_constant_target=y contains only one distinct value
167
169
  dataset_empty_target=y contains only NaN or incorrect values.
@@ -169,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
169
171
  dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
170
172
  dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
171
173
  dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
172
- binary_small_dataset=\nWARNING: The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.
174
+ binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
173
175
  all_search_keys_invalid=All search keys are invalid
174
- all_emails_invalid=\nWARNING: All values in column {} are invalid emails
175
- # Metrics validation
176
+ all_emails_invalid=All values in column {} are invalid emails # Metrics validation
176
177
  metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
177
178
  metrics_unsupported_target_type=Unsupported type of target in y: {}
178
179
  metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
@@ -188,10 +189,9 @@ ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase th
188
189
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
189
190
  ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample doesn't contain valid keys\nPlease fill the key columns with valid values and resubmit the data
190
191
  # Features info warning
191
- features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
192
+ features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
192
193
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
193
- features_not_generated=\nWARNING: Following features didn't pass checks for automated feature generation: {}
194
-
194
+ features_not_generated=Following features didn't pass checks for automated feature generation: {}
195
195
  # Information
196
196
  postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
197
197
  country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
@@ -200,12 +200,19 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
200
200
  email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
201
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
202
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
203
- phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
- target_type_detected=\nDetected task type: {}\n
203
+ phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
205
+ binary_target_reason=only two unique label-values observed
206
+ non_numeric_multiclass_reason=non-numeric label values observed
207
+ few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
208
+ date_search_key_regression_reason=date search key is present, treating as regression
209
+ many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
210
+ limited_int_multiclass_reason=integer-like values with limited unique values observed
205
211
  # all_ok_community_invite=Chat with us in Slack community:
206
212
  all_ok_community_invite=❓ Support request
207
213
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
208
214
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
215
+ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
209
216
  loss_selection_info=Using loss `{}` for feature selection
210
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
211
218
 
@@ -244,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
244
251
  relevant_data_sources_all_shap=All features SHAP
245
252
  relevant_data_sources_number=Number of relevant features
246
253
 
254
+ # Autofe descriptions
255
+ autofe_descriptions_header=*Description of AutoFE feature names
256
+ autofe_descriptions_sources=Sources
257
+ autofe_descriptions_feature_name=Feature name
258
+ autofe_descriptions_feature=Feature {}
259
+ autofe_descriptions_function=Function
260
+
261
+
247
262
  # Quality metrics table
248
263
  quality_metrics_header=Accuracy after enrichment
249
264
  quality_metrics_train_segment=Train
upgini/search_task.py CHANGED
@@ -3,6 +3,7 @@ import tempfile
3
3
  import time
4
4
  from functools import lru_cache
5
5
  from typing import Dict, List, Optional
6
+ import uuid
6
7
 
7
8
  import pandas as pd
8
9
 
@@ -97,10 +98,7 @@ class SearchTask:
97
98
  time.sleep(self.POLLING_DELAY_SECONDS)
98
99
  except KeyboardInterrupt as e:
99
100
  if not check_fit:
100
- print(bundle.get("search_stopping"))
101
- self.rest_client.stop_search_task_v2(trace_id, search_task_id)
102
- self.logger.warning(f"Search {search_task_id} stopped by user")
103
- print(bundle.get("search_stopped"))
101
+ self._stop(trace_id)
104
102
  raise e
105
103
  print()
106
104
 
@@ -133,6 +131,14 @@ class SearchTask:
133
131
 
134
132
  return self
135
133
 
134
+ def _stop(self, trace_id: Optional[str] = None):
135
+ trace_id = trace_id or uuid.uuid4()
136
+ search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
137
+ print(bundle.get("search_stopping"))
138
+ self.rest_client.stop_search_task_v2(trace_id, search_task_id)
139
+ self.logger.warning(f"Search {search_task_id} stopped by user")
140
+ print(bundle.get("search_stopped"))
141
+
136
142
  def get_all_features_metadata_v2(self) -> Optional[List[FeaturesMetadataV2]]:
137
143
  if self.provider_metadata_v2 is None:
138
144
  return None
Binary file
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError
12
12
 
13
- def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
- for column_name in column_names:
15
- if self._is_search_key_by_name(column_name):
16
- return column_name
13
+ def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
+ return [
15
+ column_name
16
+ for column_name in column_names
17
+ if self._is_search_key_by_name(column_name)
18
+ ]
17
19
 
18
- def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
- maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
- if maybe_column is not None:
21
- return maybe_column
22
-
23
- for column_name in df.columns:
20
+ def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
+ other_columns = [col for col in df.columns if col not in existing_search_keys]
22
+ columns_by_names = self._get_search_keys_by_name(other_columns)
23
+ columns_by_values = []
24
+ for column_name in other_columns:
24
25
  if self._is_search_key_by_values(df[column_name]):
25
- return column_name
26
+ columns_by_values.append(column_name)
27
+ return list(set(columns_by_names + columns_by_values))
@@ -4,6 +4,22 @@ from pandas.api.types import is_object_dtype, is_string_dtype
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
6
6
 
7
+ class CountrySearchKeyConverter:
8
+
9
+ def __init__(self, country_col: str):
10
+ self.country_col = country_col
11
+
12
+ def convert(self, df: pd.DataFrame) -> pd.DataFrame:
13
+ df[self.country_col] = (
14
+ df[self.country_col]
15
+ .astype("string")
16
+ .str.upper()
17
+ .str.replace(r"[^A-Z]", "", regex=True)
18
+ .str.replace("UK", "GB", regex=False)
19
+ )
20
+ return df
21
+
22
+
7
23
  class CountrySearchKeyDetector(BaseSearchKeyDetector):
8
24
  def _is_search_key_by_name(self, column_name: str) -> bool:
9
25
  return "country" in str(column_name).lower()
@@ -11,46 +11,49 @@ def get_runtime_params_custom_loss(
11
11
  runtime_parameters: RuntimeParameters,
12
12
  logger: Optional[logging.Logger] = None,
13
13
  ) -> RuntimeParameters:
14
+ if not loss:
15
+ return runtime_parameters
16
+
14
17
  if logger is None:
15
18
  logger = logging.getLogger()
16
- if loss is not None:
17
- selection_loss_reg = [
18
- "regression",
19
- "regression_l1",
20
- "huber",
21
- "poisson",
22
- "quantile",
23
- "mape",
24
- "mean_absolute_percentage_error",
25
- "gamma",
26
- "tweedie",
27
- ]
28
- selection_loss_binary = ["binary"]
29
- selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
30
- use_custom_loss = (
31
- True
32
- if (
33
- (model_task_type == ModelTaskType.REGRESSION)
34
- and (loss in selection_loss_reg)
35
- or (model_task_type == ModelTaskType.BINARY)
36
- and (loss in selection_loss_binary)
37
- or (model_task_type == ModelTaskType.MULTICLASS)
38
- and (loss in selection_loss_multi_clf)
39
- )
40
- else False
19
+
20
+ selection_loss_reg = [
21
+ "regression",
22
+ "regression_l1",
23
+ "huber",
24
+ "poisson",
25
+ "quantile",
26
+ "mape",
27
+ "mean_absolute_percentage_error",
28
+ "gamma",
29
+ "tweedie",
30
+ ]
31
+ selection_loss_binary = ["binary"]
32
+ selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
33
+ use_custom_loss = (
34
+ True
35
+ if (
36
+ (model_task_type == ModelTaskType.REGRESSION)
37
+ and (loss in selection_loss_reg)
38
+ or (model_task_type == ModelTaskType.BINARY)
39
+ and (loss in selection_loss_binary)
40
+ or (model_task_type == ModelTaskType.MULTICLASS)
41
+ and (loss in selection_loss_multi_clf)
41
42
  )
43
+ else False
44
+ )
42
45
 
43
- if use_custom_loss:
44
- runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
45
- runtime_parameters.properties["lightgbm_params_base.objective"] = loss
46
- runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
47
- msg = bundle.get("loss_selection_info").format(loss)
48
- logger.info(msg)
49
- print(msg)
50
- else:
51
- msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
52
- logger.warning(msg)
53
- print(msg)
46
+ if use_custom_loss:
47
+ runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
48
+ runtime_parameters.properties["lightgbm_params_base.objective"] = loss
49
+ runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
50
+ msg = bundle.get("loss_selection_info").format(loss)
51
+ logger.info(msg)
52
+ print(msg)
53
+ else:
54
+ msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
55
+ logger.warning(msg)
56
+ print(msg)
54
57
 
55
58
  return runtime_parameters
56
59