upgini 1.2.142a1__tar.gz → 1.2.143__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (82)
  1. {upgini-1.2.142a1 → upgini-1.2.143}/PKG-INFO +1 -1
  2. upgini-1.2.143/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/features_enricher.py +6 -5
  4. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/datetime_utils.py +7 -4
  5. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/postal_code_utils.py +31 -2
  6. upgini-1.2.142a1/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.142a1 → upgini-1.2.143}/.gitignore +0 -0
  8. {upgini-1.2.142a1 → upgini-1.2.143}/LICENSE +0 -0
  9. {upgini-1.2.142a1 → upgini-1.2.143}/README.md +0 -0
  10. {upgini-1.2.142a1 → upgini-1.2.143}/pyproject.toml +0 -0
  11. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/all_operators.py +0 -0
  17. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/operator.py +0 -0
  22. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/__init__.py +0 -0
  23. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/base.py +0 -0
  24. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/cross.py +0 -0
  25. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/delta.py +0 -0
  26. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/utils.py +0 -0
  32. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/autofe/vector.py +0 -0
  33. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/data_source/__init__.py +0 -0
  34. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/data_source/data_source_publisher.py +0 -0
  35. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/dataset.py +0 -0
  36. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/errors.py +0 -0
  37. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/http.py +0 -0
  38. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/config.py +0 -0
  59. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/country_utils.py +0 -0
  60. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/custom_loss_utils.py +0 -0
  61. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/cv_utils.py +0 -0
  62. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/hash_utils.py +0 -0
  70. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.142a1 → upgini-1.2.143}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.142a1
3
+ Version: 1.2.143
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.143"
@@ -2553,9 +2553,7 @@ if response.status_code == 200:
2553
2553
 
2554
2554
  self.__validate_search_keys(search_keys, self.search_id)
2555
2555
 
2556
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(
2557
- X, y, eval_set=None, is_transform=True
2558
- )
2556
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
2559
2557
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2560
2558
 
2561
2559
  validated_Xy = df.copy()
@@ -3239,6 +3237,7 @@ if response.status_code == 200:
3239
3237
  )
3240
3238
  self.fit_columns_renaming = normalizer.columns_renaming
3241
3239
  if normalizer.removed_datetime_features:
3240
+ self.fit_dropped_features.update(normalizer.removed_datetime_features)
3242
3241
  original_removed_datetime_features = [
3243
3242
  self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
3244
3243
  ]
@@ -4783,7 +4782,7 @@ if response.status_code == 200:
4783
4782
  maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
4784
4783
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
4785
4784
  date_column = next(iter(maybe_date))
4786
- if x[date_column].nunique() > 0.9 * _num_samples(x):
4785
+ if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
4787
4786
  msg = self.bundle.get("date_search_without_time_series")
4788
4787
  self.__log_warning(msg)
4789
4788
 
@@ -5024,7 +5023,9 @@ if response.status_code == 200:
5024
5023
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
5025
5024
  )
5026
5025
  else:
5027
- self.rest_client.dump_input_file(f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256)
5026
+ self.rest_client.dump_input_file(
5027
+ trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
5028
+ )
5028
5029
 
5029
5030
  if y_ is not None:
5030
5031
  if isinstance(y_, pd.Series):
@@ -282,10 +282,13 @@ class DateTimeConverter:
282
282
  warnings.filterwarnings("ignore", message="Could not infer format")
283
283
  return pd.to_datetime(df[self.date_column])
284
284
  except ValueError:
285
- if raise_errors:
286
- raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
287
- else:
288
- return None
285
+ try:
286
+ return pd.to_datetime(df[self.date_column], format="mixed", errors="raise")
287
+ except ValueError:
288
+ if raise_errors:
289
+ raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
290
+ else:
291
+ return None
289
292
 
290
293
  def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
291
294
  condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
@@ -4,16 +4,45 @@ from pandas.api.types import (
4
4
  is_object_dtype,
5
5
  is_string_dtype,
6
6
  )
7
+ import re
7
8
 
8
9
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
9
10
 
10
11
 
11
12
  class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
13
+ postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
14
+
12
15
  def _is_search_key_by_name(self, column_name: str) -> bool:
13
- return str(column_name).lower() in ["zip", "zipcode", "zip_code", "postal_code", "postalcode"]
16
+ return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
14
17
 
15
18
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
16
- return False
19
+ """
20
+ # Fast two-step check whether the column looks like a postal code.
21
+ # Returns True if, after removing missing values, values remain,
22
+ # and all of them match the common characteristics of a postal code.
23
+ """
24
+ s = column.copy().dropna().astype(str).str.strip()
25
+ s = s[s != ""] # remove empty strings
26
+ if s.empty:
27
+ return False
28
+
29
+ # remove suffix ".0" (often after float)
30
+ s = s.str.replace(r"\.0$", "", regex=True)
31
+
32
+ # --- Step 1: fast filtering ---
33
+ mask_len = s.str.len().between(2, 10)
34
+ mask_digit = s.str.contains(r'\d', regex=True)
35
+ mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
36
+ fast_mask = mask_len & mask_digit & mask_chars
37
+
38
+ # if any of them failed the fast check, return False
39
+ if not fast_mask.all():
40
+ return False
41
+
42
+ # --- Step 2: regex check ---
43
+ # only if the first step passed
44
+ valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
45
+ return valid_mask.all()
17
46
 
18
47
 
19
48
  class PostalCodeSearchKeyConverter:
@@ -1 +0,0 @@
1
- __version__ = "1.2.142a1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes