upgini 1.1.275a99__tar.gz → 1.1.277__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. More details are available from the linked release information on the original page.
- {upgini-1.1.275a99/src/upgini.egg-info → upgini-1.1.277}/PKG-INFO +1 -1
- {upgini-1.1.275a99 → upgini-1.1.277}/setup.py +1 -1
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/ads.py +6 -2
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/dataset.py +4 -3
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/features_enricher.py +6 -2
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/__init__.py +3 -2
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/country_utils.py +2 -2
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/datetime_utils.py +6 -5
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/email_utils.py +2 -2
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/target_utils.py +1 -1
- {upgini-1.1.275a99 → upgini-1.1.277/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.275a99 → upgini-1.1.277}/LICENSE +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/README.md +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/pyproject.toml +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/setup.cfg +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/errors.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/http.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/metadata.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/metrics.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/search_task.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/spinner.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_autofe_operands.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_country_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_email_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_metrics.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_target_utils.py +0 -0
- {upgini-1.1.275a99 → upgini-1.1.277}/tests/test_widget.py +0 -0
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import is_string_dtype
|
|
8
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if
|
|
37
|
+
if (
|
|
38
|
+
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
+
and not is_string_dtype(df[column_name])
|
|
40
|
+
and not is_object_dtype(df[column_name])
|
|
41
|
+
):
|
|
38
42
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
39
43
|
else:
|
|
40
44
|
meaning_type = FileColumnMeaningType.FEATURE
|
|
@@ -17,6 +17,7 @@ from pandas.api.types import (
|
|
|
17
17
|
is_numeric_dtype,
|
|
18
18
|
is_period_dtype,
|
|
19
19
|
is_string_dtype,
|
|
20
|
+
is_object_dtype,
|
|
20
21
|
)
|
|
21
22
|
|
|
22
23
|
from upgini.errors import ValidationError
|
|
@@ -219,7 +220,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
219
220
|
"""Check that string values less than maximum characters for LLM"""
|
|
220
221
|
# self.logger.info("Validate too long string values")
|
|
221
222
|
for col in self.data.columns:
|
|
222
|
-
if is_string_dtype(self.data[col]):
|
|
223
|
+
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
223
224
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
224
225
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
225
226
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -350,7 +351,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
350
351
|
if postal_code is not None and postal_code in self.data.columns:
|
|
351
352
|
# self.logger.info("Normalize postal code")
|
|
352
353
|
|
|
353
|
-
if is_string_dtype(self.data[postal_code]):
|
|
354
|
+
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
354
355
|
try:
|
|
355
356
|
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
356
357
|
except Exception:
|
|
@@ -821,7 +822,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
821
822
|
return DataType.INT
|
|
822
823
|
elif is_float_dtype(pandas_data_type):
|
|
823
824
|
return DataType.DECIMAL
|
|
824
|
-
elif is_string_dtype(pandas_data_type):
|
|
825
|
+
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
825
826
|
return DataType.STRING
|
|
826
827
|
else:
|
|
827
828
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|
|
@@ -21,6 +21,7 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
+
is_object_dtype,
|
|
24
25
|
is_period_dtype,
|
|
25
26
|
is_string_dtype,
|
|
26
27
|
)
|
|
@@ -2982,7 +2983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2982
2983
|
|
|
2983
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2984
2985
|
target = df[self.TARGET_NAME]
|
|
2985
|
-
if is_string_dtype(target):
|
|
2986
|
+
if is_string_dtype(target) or is_object_dtype(target):
|
|
2986
2987
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2987
2988
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
2988
2989
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3255,6 +3256,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3255
3256
|
descriptions = []
|
|
3256
3257
|
for m in autofe_meta:
|
|
3257
3258
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3259
|
+
orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
|
|
3260
|
+
autofe_feature.rename_columns(orig_to_hashed)
|
|
3258
3261
|
autofe_feature.set_display_index(m.display_index)
|
|
3259
3262
|
if autofe_feature.op.is_vector:
|
|
3260
3263
|
continue
|
|
@@ -3382,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3382
3385
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3383
3386
|
else:
|
|
3384
3387
|
if x[column_name].isnull().all() or (
|
|
3385
|
-
is_string_dtype(x[column_name])
|
|
3388
|
+
(is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
|
|
3389
|
+
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3386
3390
|
):
|
|
3387
3391
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3388
3392
|
|
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import is_string_dtype
|
|
5
|
+
from pandas.api.types import is_string_dtype, is_object_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -20,5 +20,6 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
20
20
|
return [
|
|
21
21
|
col
|
|
22
22
|
for col in tmp.columns
|
|
23
|
-
if is_string_dtype(tmp[col])
|
|
23
|
+
if (is_string_dtype(tmp[col]) or is_object_dtype(tmp[col]))
|
|
24
|
+
and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
24
25
|
]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from pandas.api.types import is_string_dtype
|
|
2
|
+
from pandas.api.types import is_string_dtype, is_object_dtype
|
|
3
3
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
|
9
9
|
return "country" in str(column_name).lower()
|
|
10
10
|
|
|
11
11
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
12
|
-
if not is_string_dtype(column):
|
|
12
|
+
if not is_string_dtype(column) and not is_object_dtype(column):
|
|
13
13
|
return False
|
|
14
14
|
|
|
15
15
|
all_count = len(column)
|
|
@@ -6,7 +6,10 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import
|
|
9
|
+
from pandas.api.types import (
|
|
10
|
+
is_numeric_dtype,
|
|
11
|
+
is_period_dtype,
|
|
12
|
+
)
|
|
10
13
|
|
|
11
14
|
from upgini.errors import ValidationError
|
|
12
15
|
from upgini.metadata import SearchKey
|
|
@@ -78,9 +81,6 @@ class DateTimeSearchKeyConverter:
|
|
|
78
81
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
79
82
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
80
83
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
81
|
-
elif is_string_dtype(df[self.date_column]):
|
|
82
|
-
df[self.date_column] = df[self.date_column].apply(self.clean_date)
|
|
83
|
-
df[self.date_column] = self.parse_date(df)
|
|
84
84
|
elif is_period_dtype(df[self.date_column]):
|
|
85
85
|
df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
|
|
86
86
|
elif is_numeric_dtype(df[self.date_column]):
|
|
@@ -249,7 +249,8 @@ def validate_dates_distribution(
|
|
|
249
249
|
if col in search_keys:
|
|
250
250
|
continue
|
|
251
251
|
try:
|
|
252
|
-
|
|
252
|
+
# Format mixed to avoid massive warnings
|
|
253
|
+
pd.to_datetime(X[col], format="mixed")
|
|
253
254
|
maybe_date_col = col
|
|
254
255
|
break
|
|
255
256
|
except Exception:
|
|
@@ -4,7 +4,7 @@ from hashlib import sha256
|
|
|
4
4
|
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
|
-
from pandas.api.types import is_string_dtype
|
|
7
|
+
from pandas.api.types import is_string_dtype, is_object_dtype
|
|
8
8
|
from upgini.resource_bundle import bundle
|
|
9
9
|
|
|
10
10
|
from upgini.metadata import SearchKey
|
|
@@ -18,7 +18,7 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
18
18
|
return str(column_name).lower() in ["email", "e_mail", "e-mail"]
|
|
19
19
|
|
|
20
20
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
21
|
-
if not is_string_dtype(column):
|
|
21
|
+
if not is_string_dtype(column) and not is_object_dtype:
|
|
22
22
|
return False
|
|
23
23
|
if not column.astype("string").str.contains("@").any():
|
|
24
24
|
return False
|
|
@@ -107,7 +107,7 @@ def balance_undersample(
|
|
|
107
107
|
min_class_count = vc[min_class_value]
|
|
108
108
|
|
|
109
109
|
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = min_class_percent * count
|
|
110
|
+
min_class_threshold = int(min_class_percent * count)
|
|
111
111
|
|
|
112
112
|
resampled_data = df
|
|
113
113
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|