upgini 1.2.143__tar.gz → 1.2.144__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (82)
  1. {upgini-1.2.143 → upgini-1.2.144}/PKG-INFO +1 -1
  2. upgini-1.2.144/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/features_enricher.py +20 -14
  4. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/search_task.py +2 -1
  5. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/postal_code_utils.py +4 -0
  6. upgini-1.2.143/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.143 → upgini-1.2.144}/.gitignore +0 -0
  8. {upgini-1.2.143 → upgini-1.2.144}/LICENSE +0 -0
  9. {upgini-1.2.143 → upgini-1.2.144}/README.md +0 -0
  10. {upgini-1.2.143 → upgini-1.2.144}/pyproject.toml +0 -0
  11. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/all_operators.py +0 -0
  17. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/operator.py +0 -0
  22. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/__init__.py +0 -0
  23. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/base.py +0 -0
  24. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/cross.py +0 -0
  25. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/delta.py +0 -0
  26. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/utils.py +0 -0
  32. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/autofe/vector.py +0 -0
  33. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/data_source/__init__.py +0 -0
  34. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/data_source/data_source_publisher.py +0 -0
  35. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/dataset.py +0 -0
  36. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/errors.py +0 -0
  37. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/http.py +0 -0
  38. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/hash_utils.py +0 -0
  70. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.143 → upgini-1.2.144}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.143
3
+ Version: 1.2.144
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.144"
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
274
274
  self.X: pd.DataFrame | None = None
275
275
  self.y: pd.Series | None = None
276
276
  self.eval_set: list[tuple] | None = None
277
- self.autodetected_search_keys: dict[str, SearchKey] = dict()
277
+ self.autodetected_search_keys: dict[str, SearchKey] | None = None
278
278
  self.imbalanced = False
279
279
  self.fit_select_features = True
280
280
  self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
1311
1311
  def _get_autodetected_search_keys(self):
1312
1312
  if self.autodetected_search_keys is None and self._search_task is not None:
1313
1313
  meta = self._search_task.get_file_metadata(self._get_trace_id())
1314
- self.autodetected_search_keys = {k: SearchKey[v] for k, v in meta.autodetectedSearchKeys.items()}
1314
+ autodetected_search_keys = meta.autodetectedSearchKeys or {}
1315
+ self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
1315
1316
 
1316
1317
  return self.autodetected_search_keys
1317
1318
 
1319
+ def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
1320
+ if self.autodetected_search_keys is None:
1321
+ self.autodetected_search_keys = dict()
1322
+ self.autodetected_search_keys.update(adding_search_keys)
1323
+ return self.autodetected_search_keys
1324
+
1318
1325
  def _get_fit_search_keys_with_original_names(self):
1319
1326
  if self.fit_search_keys is None and self._search_task is not None:
1320
1327
  fit_search_keys = dict()
@@ -2954,10 +2961,6 @@ if response.status_code == 200:
2954
2961
  if add_fit_system_record_id:
2955
2962
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2956
2963
 
2957
- for c in result.columns:
2958
- if result[c].dtype == "category":
2959
- result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
2960
-
2961
2964
  return result, columns_renaming, generated_features, search_keys
2962
2965
 
2963
2966
  def _selecting_input_and_generated_columns(
@@ -3647,7 +3650,8 @@ if response.status_code == 200:
3647
3650
  keys.append("EMAIL")
3648
3651
  if "DATE" in keys:
3649
3652
  keys.append("DATETIME")
3650
- search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
3653
+ autodetected_search_keys = self.autodetected_search_keys or {}
3654
+ search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
3651
3655
  return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
3652
3656
 
3653
3657
  def _validate_train_eval(
@@ -4882,8 +4886,9 @@ if response.status_code == 200:
4882
4886
  maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4883
4887
  if len(maybe_keys) > 0:
4884
4888
  datetime_key = maybe_keys[0]
4885
- search_keys[datetime_key] = SearchKey.DATETIME
4886
- self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
4889
+ new_keys = {datetime_key: SearchKey.DATETIME}
4890
+ search_keys.update(new_keys)
4891
+ self._add_autodetected_search_keys(new_keys)
4887
4892
  self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4888
4893
  print(self.bundle.get("datetime_detected").format(datetime_key))
4889
4894
 
@@ -4892,15 +4897,16 @@ if response.status_code == 200:
4892
4897
  if maybe_keys:
4893
4898
  new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
4894
4899
  search_keys.update(new_keys)
4895
- self.autodetected_search_keys.update(new_keys)
4900
+ self._add_autodetected_search_keys(new_keys)
4896
4901
  self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
4897
4902
  print(self.bundle.get("postal_code_detected").format(maybe_keys))
4898
4903
 
4899
4904
  if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
4900
4905
  maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
4901
4906
  if maybe_key:
4902
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
4903
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
4907
+ new_keys = {maybe_key[0]: SearchKey.COUNTRY}
4908
+ search_keys.update(new_keys)
4909
+ self._add_autodetected_search_keys(new_keys)
4904
4910
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
4905
4911
  print(self.bundle.get("country_detected").format(maybe_key))
4906
4912
 
@@ -4910,7 +4916,7 @@ if response.status_code == 200:
4910
4916
  if self.__is_registered or is_demo_dataset:
4911
4917
  new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
4912
4918
  search_keys.update(new_keys)
4913
- self.autodetected_search_keys.update(new_keys)
4919
+ self._add_autodetected_search_keys(new_keys)
4914
4920
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
4915
4921
  print(self.bundle.get("email_detected").format(maybe_keys))
4916
4922
  else:
@@ -4926,7 +4932,7 @@ if response.status_code == 200:
4926
4932
  if self.__is_registered or is_demo_dataset:
4927
4933
  new_keys = {key: SearchKey.PHONE for key in maybe_keys}
4928
4934
  search_keys.update(new_keys)
4929
- self.autodetected_search_keys.update(new_keys)
4935
+ self._add_autodetected_search_keys(new_keys)
4930
4936
  self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
4931
4937
  print(self.bundle.get("phone_detected").format(maybe_keys))
4932
4938
  else:
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
434
434
  tmp_file_name = f"{tmp_dir}/{file_name}"
435
435
  with open(tmp_file_name, "wb") as gzip_file:
436
436
  gzip_file.write(file_content)
437
- return pd.read_parquet(tmp_file_name, engine="fastparquet")
437
+ # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
438
+ return pd.read_parquet(tmp_file_name, engine="pyarrow")
@@ -21,6 +21,10 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
21
21
  # Returns True if, after removing missing values, values remain,
22
22
  # and all of them match the common characteristics of a postal code.
23
23
  """
24
+ # Check only columns that are candidates for postal code by column name
25
+ if not self._is_search_key_by_name(column.name):
26
+ return False
27
+
24
28
  s = column.copy().dropna().astype(str).str.strip()
25
29
  s = s[s != ""] # remove empty strings
26
30
  if s.empty:
@@ -1 +0,0 @@
1
- __version__ = "1.2.143"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes