upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. See below for more details.

Files changed (43)
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
  39. upgini-1.2.31.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
@@ -9,37 +9,34 @@ search_stopped=Search request stopped
9
9
  polling_search_task=\nRunning search request, search_id={}
10
10
  polling_unregister_information=We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
11
11
  ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
12
- demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.
12
+ demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
13
13
  transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
14
14
  transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
15
15
 
16
16
  # Warnings
17
17
  support_link=https://upgini.com/support
18
- # slack_community_link=https://4mlg.short.gy/join-upgini-community
19
- # slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
20
- support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
18
+ support_text=Looks like you've run into an error. For help request write us in support
21
19
  slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
22
20
  slack_community_alt=Upgini Slack community
23
- version_warning=\nWARNING: Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
24
- unregistered_with_personal_keys=\nWARNING: Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
25
- date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
26
- date_search_without_time_series=\nWARNING: Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
27
- metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
28
- metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
29
- metrics_no_important_features=\nWARNING: No important features to calculate metrics
21
+ version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
22
+ unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
23
+ date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
24
+ date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
25
+ metrics_exclude_paid_features=Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
26
+ metrics_no_important_free_features=No important free features to calculate metrics
27
+ metrics_no_important_features=No important features to calculate metrics
30
28
  metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
31
29
  # metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
32
- # transform_with_trial_features=\nWARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
30
+ # transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
33
31
  # Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
34
- metrics_with_paid_features=\nWARNING: The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
35
- transform_with_paid_features=\nWARNING: Enriching with Paid data is not available.\nContact Upgini support for the data access
36
- trial_quota_limit_riched=\nWARNING: You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
37
- loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection with {1}
38
- loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
- multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
- group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
- current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
42
-
32
+ metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
33
+ transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
34
+ trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
35
+ loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
+ loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
+ multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
+ group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
39
+ current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
43
40
  # Errors
44
41
  failed_search_by_task_id=Failed to retrieve the specified search results
45
42
  metrics_unfitted_enricher=Call fit method before calling calculate_metrics
@@ -81,24 +78,26 @@ date_and_datetime_simultanious=DATE and DATETIME search keys cannot be used simu
81
78
  email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneously. Choose one to keep
82
79
  postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
83
80
  multiple_search_key=Search key {} passed multiple times
84
- unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
81
+ unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
85
82
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
86
83
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
87
84
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
- search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
85
+ unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
86
+ search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
87
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
- single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
- unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
88
+ single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
89
+ unsupported_multi_key=Search key {} cannot be used multiple times
90
+ unsupported_index_column=Your column with name `index` was dropped because it's reserved name is booked for system needs.
92
91
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
93
92
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
94
93
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
95
94
  invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
96
95
  invalid_country=All values of COUNTRY column `{}` are invalid
97
- invalid_ip=All values of IPv4 column `{}` are invalid
96
+ invalid_ip=All values of IP column `{}` are invalid
98
97
  # X and y validation
99
98
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
100
99
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
101
- x_contains_enriching_columns=\nWARNING: X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
100
+ x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
102
101
  unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
103
102
  y_is_constant=y is a constant. Relevant feature search requires a non-constant y
104
103
  x_and_y_diff_size=X and y has different size: {}, {}.
@@ -111,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
111
110
  x_is_empty=X is empty
112
111
  y_is_empty=y is empty
113
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
114
- missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
115
- x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
- train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
- eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
113
+ missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
+ train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
+ eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
118
117
  # eval set validation
119
118
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
120
119
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -134,24 +133,27 @@ eval_y_is_empty=y in eval_set is empty.
134
133
  x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
135
134
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
136
135
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
136
+ missing_features_for_transform=Missing some features for transform that were presented on fit: {}
137
137
  # target validation
138
138
  empty_target=Target is empty in all rows
139
139
  # non_numeric_target=Binary target should be numerical type
140
- uneven_eval_target_distribution=\nWARNING: y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
141
- target_outliers_warning=\nWARNING: We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
140
+ uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
141
+ target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
142
+
142
143
  # features validation
143
- empty_or_contant_features=\nWARNING: Columns {} has value with frequency more than 99%, removed from X
144
- high_cardinality_features=\nWARNING: Columns {} has high cardinality (>90% unique values), removed from X
145
- # one_hot_encoded_features=\nWARNING: One hot encoded features detected. Use int encoding for correct results of fit.\n{}
144
+ empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
145
+ high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
146
+ # one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
147
+
146
148
  # Dataset validation
147
149
  dataset_too_few_rows=X size should be at least {} rows after validation
148
150
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
149
151
  dataset_empty_column_names=Some column names are empty. Add names please
150
- dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
151
- dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
152
- dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
153
- dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
154
- dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
152
+ dataset_full_duplicates={:.5f}% of the rows are fully duplicated
153
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
154
+ dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
155
+ dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
156
+ dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
155
157
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
156
158
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
157
159
  dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
@@ -160,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
160
162
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
161
163
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
162
164
  dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
163
- dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
164
- dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
165
+ dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
166
+ dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
165
167
  dataset_too_many_features=Too many features. Maximum number of features is {}
166
168
  dataset_constant_target=y contains only one distinct value
167
169
  dataset_empty_target=y contains only NaN or incorrect values.
@@ -169,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
169
171
  dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
170
172
  dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
171
173
  dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
172
- binary_small_dataset=\nWARNING: The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.
174
+ binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
173
175
  all_search_keys_invalid=All search keys are invalid
174
- all_emails_invalid=\nWARNING: All values in column {} are invalid emails
175
- # Metrics validation
176
+ all_emails_invalid=All values in column {} are invalid emails # Metrics validation
176
177
  metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
177
178
  metrics_unsupported_target_type=Unsupported type of target in y: {}
178
179
  metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
@@ -188,10 +189,9 @@ ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase th
188
189
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
189
190
  ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample doesn't contain valid keys\nPlease fill the key columns with valid values and resubmit the data
190
191
  # Features info warning
191
- features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
192
+ features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
192
193
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
193
- features_not_generated=\nWARNING: Following features didn't pass checks for automated feature generation: {}
194
-
194
+ features_not_generated=Following features didn't pass checks for automated feature generation: {}
195
195
  # Information
196
196
  postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
197
197
  country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
@@ -200,12 +200,19 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
200
200
  email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
201
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
202
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
203
- phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
- target_type_detected=\nDetected task type: {}\n
203
+ phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
205
+ binary_target_reason=only two unique label-values observed
206
+ non_numeric_multiclass_reason=non-numeric label values observed
207
+ few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
208
+ date_search_key_regression_reason=date search key is present, treating as regression
209
+ many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
210
+ limited_int_multiclass_reason=integer-like values with limited unique values observed
205
211
  # all_ok_community_invite=Chat with us in Slack community:
206
212
  all_ok_community_invite=❓ Support request
207
213
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
208
214
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
215
+ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
209
216
  loss_selection_info=Using loss `{}` for feature selection
210
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
211
218
 
@@ -244,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
244
251
  relevant_data_sources_all_shap=All features SHAP
245
252
  relevant_data_sources_number=Number of relevant features
246
253
 
254
+ # Autofe descriptions
255
+ autofe_descriptions_header=*Description of AutoFE feature names
256
+ autofe_descriptions_sources=Sources
257
+ autofe_descriptions_feature_name=Feature name
258
+ autofe_descriptions_feature=Feature {}
259
+ autofe_descriptions_function=Function
260
+
261
+
247
262
  # Quality metrics table
248
263
  quality_metrics_header=Accuracy after enrichment
249
264
  quality_metrics_train_segment=Train
upgini/search_task.py CHANGED
@@ -3,6 +3,7 @@ import tempfile
3
3
  import time
4
4
  from functools import lru_cache
5
5
  from typing import Dict, List, Optional
6
+ import uuid
6
7
 
7
8
  import pandas as pd
8
9
 
@@ -97,10 +98,7 @@ class SearchTask:
97
98
  time.sleep(self.POLLING_DELAY_SECONDS)
98
99
  except KeyboardInterrupt as e:
99
100
  if not check_fit:
100
- print(bundle.get("search_stopping"))
101
- self.rest_client.stop_search_task_v2(trace_id, search_task_id)
102
- self.logger.warning(f"Search {search_task_id} stopped by user")
103
- print(bundle.get("search_stopped"))
101
+ self._stop(trace_id)
104
102
  raise e
105
103
  print()
106
104
 
@@ -133,6 +131,14 @@ class SearchTask:
133
131
 
134
132
  return self
135
133
 
134
+ def _stop(self, trace_id: Optional[str] = None):
135
+ trace_id = trace_id or uuid.uuid4()
136
+ search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
137
+ print(bundle.get("search_stopping"))
138
+ self.rest_client.stop_search_task_v2(trace_id, search_task_id)
139
+ self.logger.warning(f"Search {search_task_id} stopped by user")
140
+ print(bundle.get("search_stopped"))
141
+
136
142
  def get_all_features_metadata_v2(self) -> Optional[List[FeaturesMetadataV2]]:
137
143
  if self.provider_metadata_v2 is None:
138
144
  return None
Binary file
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError
12
12
 
13
- def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
- for column_name in column_names:
15
- if self._is_search_key_by_name(column_name):
16
- return column_name
13
+ def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
+ return [
15
+ column_name
16
+ for column_name in column_names
17
+ if self._is_search_key_by_name(column_name)
18
+ ]
17
19
 
18
- def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
- maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
- if maybe_column is not None:
21
- return maybe_column
22
-
23
- for column_name in df.columns:
20
+ def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
+ other_columns = [col for col in df.columns if col not in existing_search_keys]
22
+ columns_by_names = self._get_search_keys_by_name(other_columns)
23
+ columns_by_values = []
24
+ for column_name in other_columns:
24
25
  if self._is_search_key_by_values(df[column_name]):
25
- return column_name
26
+ columns_by_values.append(column_name)
27
+ return list(set(columns_by_names + columns_by_values))
@@ -4,6 +4,22 @@ from pandas.api.types import is_object_dtype, is_string_dtype
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
6
6
 
7
+ class CountrySearchKeyConverter:
8
+
9
+ def __init__(self, country_col: str):
10
+ self.country_col = country_col
11
+
12
+ def convert(self, df: pd.DataFrame) -> pd.DataFrame:
13
+ df[self.country_col] = (
14
+ df[self.country_col]
15
+ .astype("string")
16
+ .str.upper()
17
+ .str.replace(r"[^A-Z]", "", regex=True)
18
+ .str.replace("UK", "GB", regex=False)
19
+ )
20
+ return df
21
+
22
+
7
23
  class CountrySearchKeyDetector(BaseSearchKeyDetector):
8
24
  def _is_search_key_by_name(self, column_name: str) -> bool:
9
25
  return "country" in str(column_name).lower()
@@ -11,46 +11,49 @@ def get_runtime_params_custom_loss(
11
11
  runtime_parameters: RuntimeParameters,
12
12
  logger: Optional[logging.Logger] = None,
13
13
  ) -> RuntimeParameters:
14
+ if not loss:
15
+ return runtime_parameters
16
+
14
17
  if logger is None:
15
18
  logger = logging.getLogger()
16
- if loss is not None:
17
- selection_loss_reg = [
18
- "regression",
19
- "regression_l1",
20
- "huber",
21
- "poisson",
22
- "quantile",
23
- "mape",
24
- "mean_absolute_percentage_error",
25
- "gamma",
26
- "tweedie",
27
- ]
28
- selection_loss_binary = ["binary"]
29
- selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
30
- use_custom_loss = (
31
- True
32
- if (
33
- (model_task_type == ModelTaskType.REGRESSION)
34
- and (loss in selection_loss_reg)
35
- or (model_task_type == ModelTaskType.BINARY)
36
- and (loss in selection_loss_binary)
37
- or (model_task_type == ModelTaskType.MULTICLASS)
38
- and (loss in selection_loss_multi_clf)
39
- )
40
- else False
19
+
20
+ selection_loss_reg = [
21
+ "regression",
22
+ "regression_l1",
23
+ "huber",
24
+ "poisson",
25
+ "quantile",
26
+ "mape",
27
+ "mean_absolute_percentage_error",
28
+ "gamma",
29
+ "tweedie",
30
+ ]
31
+ selection_loss_binary = ["binary"]
32
+ selection_loss_multi_clf = ["multiclass", "multiclassova", "multiclass_ova", "ova", "ovr"]
33
+ use_custom_loss = (
34
+ True
35
+ if (
36
+ (model_task_type == ModelTaskType.REGRESSION)
37
+ and (loss in selection_loss_reg)
38
+ or (model_task_type == ModelTaskType.BINARY)
39
+ and (loss in selection_loss_binary)
40
+ or (model_task_type == ModelTaskType.MULTICLASS)
41
+ and (loss in selection_loss_multi_clf)
41
42
  )
43
+ else False
44
+ )
42
45
 
43
- if use_custom_loss:
44
- runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
45
- runtime_parameters.properties["lightgbm_params_base.objective"] = loss
46
- runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
47
- msg = bundle.get("loss_selection_info").format(loss)
48
- logger.info(msg)
49
- print(msg)
50
- else:
51
- msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
52
- logger.warning(msg)
53
- print(msg)
46
+ if use_custom_loss:
47
+ runtime_parameters.properties["lightgbm_params_preselection.objective"] = loss
48
+ runtime_parameters.properties["lightgbm_params_base.objective"] = loss
49
+ runtime_parameters.properties["lightgbm_params_segment.objective"] = loss
50
+ msg = bundle.get("loss_selection_info").format(loss)
51
+ logger.info(msg)
52
+ print(msg)
53
+ else:
54
+ msg = bundle.get("loss_selection_warn").format(loss, model_task_type)
55
+ logger.warning(msg)
56
+ print(msg)
54
57
 
55
58
  return runtime_parameters
56
59