upgini 1.2.143__tar.gz → 1.2.145__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (82)
  1. {upgini-1.2.143 → upgini-1.2.145}/PKG-INFO +1 -1
  2. upgini-1.2.145/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/features_enricher.py +26 -17
  4. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/search_task.py +2 -1
  5. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/postal_code_utils.py +4 -0
  6. upgini-1.2.143/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.143 → upgini-1.2.145}/.gitignore +0 -0
  8. {upgini-1.2.143 → upgini-1.2.145}/LICENSE +0 -0
  9. {upgini-1.2.143 → upgini-1.2.145}/README.md +0 -0
  10. {upgini-1.2.143 → upgini-1.2.145}/pyproject.toml +0 -0
  11. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/all_operators.py +0 -0
  17. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/operator.py +0 -0
  22. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/__init__.py +0 -0
  23. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/base.py +0 -0
  24. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/cross.py +0 -0
  25. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/delta.py +0 -0
  26. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/utils.py +0 -0
  32. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/autofe/vector.py +0 -0
  33. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/data_source/__init__.py +0 -0
  34. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/data_source/data_source_publisher.py +0 -0
  35. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/dataset.py +0 -0
  36. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/errors.py +0 -0
  37. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/http.py +0 -0
  38. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/hash_utils.py +0 -0
  70. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.143 → upgini-1.2.145}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.143
3
+ Version: 1.2.145
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.145"
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
274
274
  self.X: pd.DataFrame | None = None
275
275
  self.y: pd.Series | None = None
276
276
  self.eval_set: list[tuple] | None = None
277
- self.autodetected_search_keys: dict[str, SearchKey] = dict()
277
+ self.autodetected_search_keys: dict[str, SearchKey] | None = None
278
278
  self.imbalanced = False
279
279
  self.fit_select_features = True
280
280
  self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
1311
1311
  def _get_autodetected_search_keys(self):
1312
1312
  if self.autodetected_search_keys is None and self._search_task is not None:
1313
1313
  meta = self._search_task.get_file_metadata(self._get_trace_id())
1314
- self.autodetected_search_keys = {k: SearchKey[v] for k, v in meta.autodetectedSearchKeys.items()}
1314
+ autodetected_search_keys = meta.autodetectedSearchKeys or {}
1315
+ self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
1315
1316
 
1316
1317
  return self.autodetected_search_keys
1317
1318
 
1319
+ def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
1320
+ if self.autodetected_search_keys is None:
1321
+ self.autodetected_search_keys = dict()
1322
+ self.autodetected_search_keys.update(adding_search_keys)
1323
+ return self.autodetected_search_keys
1324
+
1318
1325
  def _get_fit_search_keys_with_original_names(self):
1319
1326
  if self.fit_search_keys is None and self._search_task is not None:
1320
1327
  fit_search_keys = dict()
@@ -2954,10 +2961,6 @@ if response.status_code == 200:
2954
2961
  if add_fit_system_record_id:
2955
2962
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2956
2963
 
2957
- for c in result.columns:
2958
- if result[c].dtype == "category":
2959
- result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
2960
-
2961
2964
  return result, columns_renaming, generated_features, search_keys
2962
2965
 
2963
2966
  def _selecting_input_and_generated_columns(
@@ -2997,15 +3000,16 @@ if response.status_code == 200:
2997
3000
 
2998
3001
  return selected_input_columns + selected_generated_features
2999
3002
 
3000
- def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
3003
+ def _validate_empty_search_keys(self, search_keys: dict[str, SearchKey], is_transform: bool = False):
3001
3004
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
3002
- if search_id:
3003
- self.logger.debug(f"search_id {search_id} provided without search_keys")
3005
+ if is_transform:
3006
+ self.logger.debug("Transform started without search_keys")
3004
3007
  return
3005
3008
  else:
3006
3009
  self.logger.warning("search_keys not provided")
3007
3010
  raise ValidationError(self.bundle.get("empty_search_keys"))
3008
3011
 
3012
+ def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
3009
3013
  key_types = search_keys.values()
3010
3014
 
3011
3015
  # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
@@ -3647,7 +3651,8 @@ if response.status_code == 200:
3647
3651
  keys.append("EMAIL")
3648
3652
  if "DATE" in keys:
3649
3653
  keys.append("DATETIME")
3650
- search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
3654
+ autodetected_search_keys = self.autodetected_search_keys or {}
3655
+ search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
3651
3656
  return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
3652
3657
 
3653
3658
  def _validate_train_eval(
@@ -4797,6 +4802,8 @@ if response.status_code == 200:
4797
4802
 
4798
4803
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
4799
4804
 
4805
+ self._validate_empty_search_keys(valid_search_keys, is_transform=is_transform)
4806
+
4800
4807
  return valid_search_keys
4801
4808
 
4802
4809
  def __show_metrics(
@@ -4882,8 +4889,9 @@ if response.status_code == 200:
4882
4889
  maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4883
4890
  if len(maybe_keys) > 0:
4884
4891
  datetime_key = maybe_keys[0]
4885
- search_keys[datetime_key] = SearchKey.DATETIME
4886
- self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
4892
+ new_keys = {datetime_key: SearchKey.DATETIME}
4893
+ search_keys.update(new_keys)
4894
+ self._add_autodetected_search_keys(new_keys)
4887
4895
  self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4888
4896
  print(self.bundle.get("datetime_detected").format(datetime_key))
4889
4897
 
@@ -4892,15 +4900,16 @@ if response.status_code == 200:
4892
4900
  if maybe_keys:
4893
4901
  new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
4894
4902
  search_keys.update(new_keys)
4895
- self.autodetected_search_keys.update(new_keys)
4903
+ self._add_autodetected_search_keys(new_keys)
4896
4904
  self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
4897
4905
  print(self.bundle.get("postal_code_detected").format(maybe_keys))
4898
4906
 
4899
4907
  if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
4900
4908
  maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
4901
4909
  if maybe_key:
4902
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
4903
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
4910
+ new_keys = {maybe_key[0]: SearchKey.COUNTRY}
4911
+ search_keys.update(new_keys)
4912
+ self._add_autodetected_search_keys(new_keys)
4904
4913
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
4905
4914
  print(self.bundle.get("country_detected").format(maybe_key))
4906
4915
 
@@ -4910,7 +4919,7 @@ if response.status_code == 200:
4910
4919
  if self.__is_registered or is_demo_dataset:
4911
4920
  new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
4912
4921
  search_keys.update(new_keys)
4913
- self.autodetected_search_keys.update(new_keys)
4922
+ self._add_autodetected_search_keys(new_keys)
4914
4923
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
4915
4924
  print(self.bundle.get("email_detected").format(maybe_keys))
4916
4925
  else:
@@ -4926,7 +4935,7 @@ if response.status_code == 200:
4926
4935
  if self.__is_registered or is_demo_dataset:
4927
4936
  new_keys = {key: SearchKey.PHONE for key in maybe_keys}
4928
4937
  search_keys.update(new_keys)
4929
- self.autodetected_search_keys.update(new_keys)
4938
+ self._add_autodetected_search_keys(new_keys)
4930
4939
  self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
4931
4940
  print(self.bundle.get("phone_detected").format(maybe_keys))
4932
4941
  else:
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
434
434
  tmp_file_name = f"{tmp_dir}/{file_name}"
435
435
  with open(tmp_file_name, "wb") as gzip_file:
436
436
  gzip_file.write(file_content)
437
- return pd.read_parquet(tmp_file_name, engine="fastparquet")
437
+ # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
438
+ return pd.read_parquet(tmp_file_name, engine="pyarrow")
@@ -21,6 +21,10 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
21
21
  # Returns True if, after removing missing values, values remain,
22
22
  # and all of them match the common characteristics of a postal code.
23
23
  """
24
+ # Check only columns that are candidates for postal code by column name
25
+ if not self._is_search_key_by_name(column.name):
26
+ return False
27
+
24
28
  s = column.copy().dropna().astype(str).str.strip()
25
29
  s = s[s != ""] # remove empty strings
26
30
  if s.empty:
@@ -1 +0,0 @@
1
- __version__ = "1.2.143"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes