upgini 1.2.26__tar.gz → 1.2.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.26 → upgini-1.2.28}/PKG-INFO +1 -1
  2. upgini-1.2.28/src/upgini/__about__.py +1 -0
  3. upgini-1.2.28/src/upgini/__init__.py +5 -0
  4. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/features_enricher.py +29 -16
  5. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/strings.properties +2 -2
  6. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/target_utils.py +16 -16
  7. upgini-1.2.26/src/upgini/__about__.py +0 -1
  8. upgini-1.2.26/src/upgini/__init__.py +0 -13
  9. {upgini-1.2.26 → upgini-1.2.28}/.gitignore +0 -0
  10. {upgini-1.2.26 → upgini-1.2.28}/LICENSE +0 -0
  11. {upgini-1.2.26 → upgini-1.2.28}/README.md +0 -0
  12. {upgini-1.2.26 → upgini-1.2.28}/pyproject.toml +0 -0
  13. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads.py +0 -0
  14. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/dataset.py +0 -0
  28. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/errors.py +0 -0
  29. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/http.py +0 -0
  30. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/metadata.py +0 -0
  34. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/metrics.py +0 -0
  35. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  47. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/datetime_utils.py +0 -0
  54. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/phone_utils.py +0 -0
  62. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/postal_code_utils.py +0 -0
  63. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.26 → upgini-1.2.28}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.26
3
+ Version: 1.2.28
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.28"
@@ -0,0 +1,5 @@
1
+ from upgini.features_enricher import FeaturesEnricher # noqa: F401
2
+ from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
3
+ import warnings
4
+
5
+ warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")
@@ -2026,7 +2026,10 @@ class FeaturesEnricher(TransformerMixin):
2026
2026
  start_time = time.time()
2027
2027
  with MDC(trace_id=trace_id):
2028
2028
  self.logger.info("Start transform")
2029
- self.__log_debug_information(X, exclude_features_sources=exclude_features_sources)
2029
+
2030
+ validated_X = self._validate_X(X, is_transform=True)
2031
+
2032
+ self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
2030
2033
 
2031
2034
  self.__validate_search_keys(self.search_keys, self.search_id)
2032
2035
 
@@ -2058,8 +2061,6 @@ class FeaturesEnricher(TransformerMixin):
2058
2061
  self.logger.info(msg)
2059
2062
  print(msg)
2060
2063
 
2061
- validated_X = self._validate_X(X, is_transform=True)
2062
-
2063
2064
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2064
2065
 
2065
2066
  columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
@@ -2476,9 +2477,9 @@ class FeaturesEnricher(TransformerMixin):
2476
2477
  validate_scoring_argument(scoring)
2477
2478
 
2478
2479
  self.__log_debug_information(
2479
- X,
2480
- y,
2481
- eval_set,
2480
+ validated_X,
2481
+ validated_y,
2482
+ validated_eval_set,
2482
2483
  exclude_features_sources=exclude_features_sources,
2483
2484
  calculate_metrics=calculate_metrics,
2484
2485
  scoring=scoring,
@@ -2546,9 +2547,11 @@ class FeaturesEnricher(TransformerMixin):
2546
2547
  self.fit_generated_features.extend(generator.generated_features)
2547
2548
 
2548
2549
  # Checks that need validated date
2549
-
2550
- if not is_dates_distribution_valid(df, self.fit_search_keys):
2551
- self.__log_warning(bundle.get("x_unstable_by_date"))
2550
+ try:
2551
+ if not is_dates_distribution_valid(df, self.fit_search_keys):
2552
+ self.__log_warning(bundle.get("x_unstable_by_date"))
2553
+ except Exception:
2554
+ self.logger.exception("Failed to check dates distribution validity")
2552
2555
 
2553
2556
  if (
2554
2557
  is_numeric_dtype(df[self.TARGET_NAME])
@@ -3760,11 +3763,17 @@ class FeaturesEnricher(TransformerMixin):
3760
3763
  if len(passed_unsupported_search_keys) > 0:
3761
3764
  raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
3762
3765
 
3766
+ x_columns = [
3767
+ c
3768
+ for c in x.columns
3769
+ if c not in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
3770
+ ]
3771
+
3763
3772
  for column_id, meaning_type in search_keys.items():
3764
3773
  column_name = None
3765
3774
  if isinstance(column_id, str):
3766
3775
  if column_id not in x.columns:
3767
- raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
3776
+ raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, x_columns))
3768
3777
  column_name = column_id
3769
3778
  valid_search_keys[column_name] = meaning_type
3770
3779
  elif isinstance(column_id, int):
@@ -4038,15 +4047,19 @@ class FeaturesEnricher(TransformerMixin):
4038
4047
  half_train = round(len(train) / 2)
4039
4048
  part1 = train[:half_train]
4040
4049
  part2 = train[half_train:]
4041
- train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
4042
- if train_psi > 0.2:
4043
- self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
4050
+ train_psi_result = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
4051
+ if isinstance(train_psi_result, Exception):
4052
+ self.logger.exception("Failed to calculate train PSI", train_psi_result)
4053
+ elif train_psi_result > 0.2:
4054
+ self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi_result))
4044
4055
 
4045
4056
  # 2. Check train-test PSI
4046
4057
  if eval1 is not None:
4047
- train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
4048
- if train_test_psi > 0.2:
4049
- self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
4058
+ train_test_psi_result = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
4059
+ if isinstance(train_test_psi_result, Exception):
4060
+ self.logger.exception("Failed to calculate test PSI", train_test_psi_result)
4061
+ elif train_test_psi_result > 0.2:
4062
+ self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi_result))
4050
4063
 
4051
4064
  def _dump_python_libs(self):
4052
4065
  try:
@@ -201,7 +201,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
201
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
202
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
203
203
  phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
- target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
204
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
205
205
  binary_target_reason=only two unique label-values observed
206
206
  non_numeric_multiclass_reason=non-numeric label values observed
207
207
  few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
@@ -212,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
212
212
  all_ok_community_invite=❓ Support request
213
213
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
214
214
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
215
- imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
215
+ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
 
@@ -229,25 +229,25 @@ def balance_undersample(
229
229
  return resampled_data
230
230
 
231
231
 
232
- def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
233
- df = pd.concat([expected, actual])
232
+ def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
233
+ try:
234
+ df = pd.concat([expected, actual])
234
235
 
235
- if is_bool_dtype(df):
236
- df = np.where(df, 1, 0)
236
+ if is_bool_dtype(df):
237
+ df = np.where(df, 1, 0)
237
238
 
238
- # Define the bins for the target variable
239
- df_min = df.min()
240
- df_max = df.max()
241
- bins = [df_min, (df_min + df_max) / 2, df_max]
239
+ # Define the bins for the target variable
240
+ df_min = df.min()
241
+ df_max = df.max()
242
+ bins = [df_min, (df_min + df_max) / 2, df_max]
242
243
 
243
- # Calculate the base distribution
244
- train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
244
+ # Calculate the base distribution
245
+ train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
245
246
 
246
- # Calculate the target distribution
247
- test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
247
+ # Calculate the target distribution
248
+ test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
248
249
 
249
- # Calculate the PSI
250
- try:
250
+ # Calculate the PSI
251
251
  return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
252
- except Exception:
253
- return np.nan
252
+ except Exception as e:
253
+ return e
@@ -1 +0,0 @@
1
- __version__ = "1.2.26"
@@ -1,13 +0,0 @@
1
- import os
2
-
3
- from upgini.features_enricher import FeaturesEnricher # noqa: F401
4
- from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
5
- # from .lazy_import import LazyImport
6
-
7
- os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
8
-
9
- # FeaturesEnricher = LazyImport("upgini.features_enricher", "FeaturesEnricher")
10
- # SearchKey = LazyImport("upgini.metadata", "SearchKey")
11
- # RuntimeParameters = LazyImport("upgini.metadata", "RuntimeParameters")
12
- # CVType = LazyImport("upgini.metadata", "CVType")
13
- # ModelTaskType = LazyImport("upgini.metadata", "ModelTaskType")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes