upgini 1.2.142a2__tar.gz → 1.2.144__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.142a2 → upgini-1.2.144}/PKG-INFO +1 -1
- upgini-1.2.144/src/upgini/__about__.py +1 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/features_enricher.py +25 -19
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/search_task.py +2 -1
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/postal_code_utils.py +35 -2
- upgini-1.2.142a2/src/upgini/__about__.py +0 -1
- {upgini-1.2.142a2 → upgini-1.2.144}/.gitignore +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/LICENSE +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/README.md +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/pyproject.toml +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/ads.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/dataset.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/errors.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/http.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/metadata.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/metrics.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/spinner.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/config.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/hash_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/psi.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.142a2 → upgini-1.2.144}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.144"
|
|
@@ -274,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
274
274
|
self.X: pd.DataFrame | None = None
|
|
275
275
|
self.y: pd.Series | None = None
|
|
276
276
|
self.eval_set: list[tuple] | None = None
|
|
277
|
-
self.autodetected_search_keys: dict[str, SearchKey] =
|
|
277
|
+
self.autodetected_search_keys: dict[str, SearchKey] | None = None
|
|
278
278
|
self.imbalanced = False
|
|
279
279
|
self.fit_select_features = True
|
|
280
280
|
self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
|
|
@@ -1311,10 +1311,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1311
1311
|
def _get_autodetected_search_keys(self):
|
|
1312
1312
|
if self.autodetected_search_keys is None and self._search_task is not None:
|
|
1313
1313
|
meta = self._search_task.get_file_metadata(self._get_trace_id())
|
|
1314
|
-
|
|
1314
|
+
autodetected_search_keys = meta.autodetectedSearchKeys or {}
|
|
1315
|
+
self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
|
|
1315
1316
|
|
|
1316
1317
|
return self.autodetected_search_keys
|
|
1317
1318
|
|
|
1319
|
+
def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
|
|
1320
|
+
if self.autodetected_search_keys is None:
|
|
1321
|
+
self.autodetected_search_keys = dict()
|
|
1322
|
+
self.autodetected_search_keys.update(adding_search_keys)
|
|
1323
|
+
return self.autodetected_search_keys
|
|
1324
|
+
|
|
1318
1325
|
def _get_fit_search_keys_with_original_names(self):
|
|
1319
1326
|
if self.fit_search_keys is None and self._search_task is not None:
|
|
1320
1327
|
fit_search_keys = dict()
|
|
@@ -2553,9 +2560,7 @@ if response.status_code == 200:
|
|
|
2553
2560
|
|
|
2554
2561
|
self.__validate_search_keys(search_keys, self.search_id)
|
|
2555
2562
|
|
|
2556
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(
|
|
2557
|
-
X, y, eval_set=None, is_transform=True
|
|
2558
|
-
)
|
|
2563
|
+
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
|
|
2559
2564
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
|
2560
2565
|
|
|
2561
2566
|
validated_Xy = df.copy()
|
|
@@ -2956,10 +2961,6 @@ if response.status_code == 200:
|
|
|
2956
2961
|
if add_fit_system_record_id:
|
|
2957
2962
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2958
2963
|
|
|
2959
|
-
for c in result.columns:
|
|
2960
|
-
if result[c].dtype == "category":
|
|
2961
|
-
result.loc[:, c] = np.where(~result[c].isin(result[c].dtype.categories), np.nan, result[c])
|
|
2962
|
-
|
|
2963
2964
|
return result, columns_renaming, generated_features, search_keys
|
|
2964
2965
|
|
|
2965
2966
|
def _selecting_input_and_generated_columns(
|
|
@@ -3649,7 +3650,8 @@ if response.status_code == 200:
|
|
|
3649
3650
|
keys.append("EMAIL")
|
|
3650
3651
|
if "DATE" in keys:
|
|
3651
3652
|
keys.append("DATETIME")
|
|
3652
|
-
|
|
3653
|
+
autodetected_search_keys = self.autodetected_search_keys or {}
|
|
3654
|
+
search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
|
|
3653
3655
|
return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
|
|
3654
3656
|
|
|
3655
3657
|
def _validate_train_eval(
|
|
@@ -4784,7 +4786,7 @@ if response.status_code == 200:
|
|
|
4784
4786
|
maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
|
|
4785
4787
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
4786
4788
|
date_column = next(iter(maybe_date))
|
|
4787
|
-
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
4789
|
+
if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
|
|
4788
4790
|
msg = self.bundle.get("date_search_without_time_series")
|
|
4789
4791
|
self.__log_warning(msg)
|
|
4790
4792
|
|
|
@@ -4884,8 +4886,9 @@ if response.status_code == 200:
|
|
|
4884
4886
|
maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4885
4887
|
if len(maybe_keys) > 0:
|
|
4886
4888
|
datetime_key = maybe_keys[0]
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
+
new_keys = {datetime_key: SearchKey.DATETIME}
|
|
4890
|
+
search_keys.update(new_keys)
|
|
4891
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4889
4892
|
self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
|
|
4890
4893
|
print(self.bundle.get("datetime_detected").format(datetime_key))
|
|
4891
4894
|
|
|
@@ -4894,15 +4897,16 @@ if response.status_code == 200:
|
|
|
4894
4897
|
if maybe_keys:
|
|
4895
4898
|
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
4896
4899
|
search_keys.update(new_keys)
|
|
4897
|
-
self.
|
|
4900
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4898
4901
|
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
4899
4902
|
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
4900
4903
|
|
|
4901
4904
|
if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
|
|
4902
4905
|
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4903
4906
|
if maybe_key:
|
|
4904
|
-
|
|
4905
|
-
|
|
4907
|
+
new_keys = {maybe_key[0]: SearchKey.COUNTRY}
|
|
4908
|
+
search_keys.update(new_keys)
|
|
4909
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4906
4910
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
4907
4911
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
4908
4912
|
|
|
@@ -4912,7 +4916,7 @@ if response.status_code == 200:
|
|
|
4912
4916
|
if self.__is_registered or is_demo_dataset:
|
|
4913
4917
|
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
4914
4918
|
search_keys.update(new_keys)
|
|
4915
|
-
self.
|
|
4919
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4916
4920
|
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
4917
4921
|
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
4918
4922
|
else:
|
|
@@ -4928,7 +4932,7 @@ if response.status_code == 200:
|
|
|
4928
4932
|
if self.__is_registered or is_demo_dataset:
|
|
4929
4933
|
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
4930
4934
|
search_keys.update(new_keys)
|
|
4931
|
-
self.
|
|
4935
|
+
self._add_autodetected_search_keys(new_keys)
|
|
4932
4936
|
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
4933
4937
|
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
4934
4938
|
else:
|
|
@@ -5025,7 +5029,9 @@ if response.status_code == 200:
|
|
|
5025
5029
|
f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
|
|
5026
5030
|
)
|
|
5027
5031
|
else:
|
|
5028
|
-
self.rest_client.dump_input_file(
|
|
5032
|
+
self.rest_client.dump_input_file(
|
|
5033
|
+
trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
|
|
5034
|
+
)
|
|
5029
5035
|
|
|
5030
5036
|
if y_ is not None:
|
|
5031
5037
|
if isinstance(y_, pd.Series):
|
|
@@ -434,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
|
|
|
434
434
|
tmp_file_name = f"{tmp_dir}/{file_name}"
|
|
435
435
|
with open(tmp_file_name, "wb") as gzip_file:
|
|
436
436
|
gzip_file.write(file_content)
|
|
437
|
-
|
|
437
|
+
# Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
|
|
438
|
+
return pd.read_parquet(tmp_file_name, engine="pyarrow")
|
|
@@ -4,16 +4,49 @@ from pandas.api.types import (
|
|
|
4
4
|
is_object_dtype,
|
|
5
5
|
is_string_dtype,
|
|
6
6
|
)
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
13
|
+
postal_pattern = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\s\-]{1,9}$')
|
|
14
|
+
|
|
12
15
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
13
|
-
return str(column_name).lower()
|
|
16
|
+
return "zip" in str(column_name).lower() or "postal" in str(column_name).lower()
|
|
14
17
|
|
|
15
18
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
16
|
-
|
|
19
|
+
"""
|
|
20
|
+
# Fast two-step check whether the column looks like a postal code.
|
|
21
|
+
# Returns True if, after removing missing values, values remain,
|
|
22
|
+
# and all of them match the common characteristics of a postal code.
|
|
23
|
+
"""
|
|
24
|
+
# Check only columns that are candidates for postal code by column name
|
|
25
|
+
if not self._is_search_key_by_name(column.name):
|
|
26
|
+
return False
|
|
27
|
+
|
|
28
|
+
s = column.copy().dropna().astype(str).str.strip()
|
|
29
|
+
s = s[s != ""] # remove empty strings
|
|
30
|
+
if s.empty:
|
|
31
|
+
return False
|
|
32
|
+
|
|
33
|
+
# remove suffix ".0" (often after float)
|
|
34
|
+
s = s.str.replace(r"\.0$", "", regex=True)
|
|
35
|
+
|
|
36
|
+
# --- Step 1: fast filtering ---
|
|
37
|
+
mask_len = s.str.len().between(2, 10)
|
|
38
|
+
mask_digit = s.str.contains(r'\d', regex=True)
|
|
39
|
+
mask_chars = ~s.str.contains(r'[^A-Za-z0-9\s\-]', regex=True)
|
|
40
|
+
fast_mask = mask_len & mask_digit & mask_chars
|
|
41
|
+
|
|
42
|
+
# if any of them failed the fast check, return False
|
|
43
|
+
if not fast_mask.all():
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
# --- Step 2: regex check ---
|
|
47
|
+
# only if the first step passed
|
|
48
|
+
valid_mask = s.apply(lambda x: bool(self.postal_pattern.fullmatch(x)))
|
|
49
|
+
return valid_mask.all()
|
|
17
50
|
|
|
18
51
|
|
|
19
52
|
class PostalCodeSearchKeyConverter:
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.142a2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|