upgini 1.1.275__tar.gz → 1.1.275a99__tar.gz
This diff compares the contents of two publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.275/src/upgini.egg-info → upgini-1.1.275a99}/PKG-INFO +1 -1
- {upgini-1.1.275 → upgini-1.1.275a99}/setup.py +1 -1
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/ads.py +2 -6
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/dataset.py +3 -4
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/features_enricher.py +2 -4
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/__init__.py +2 -3
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/country_utils.py +2 -2
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/datetime_utils.py +4 -4
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/email_utils.py +2 -2
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/target_utils.py +1 -1
- {upgini-1.1.275 → upgini-1.1.275a99/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.275 → upgini-1.1.275a99}/LICENSE +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/README.md +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/pyproject.toml +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/setup.cfg +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/errors.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/http.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/metadata.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/metrics.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/search_task.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/spinner.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_autofe_operands.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_country_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_email_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_metrics.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_target_utils.py +0 -0
- {upgini-1.1.275 → upgini-1.1.275a99}/tests/test_widget.py +0 -0
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import
|
|
8
|
+
from pandas.api.types import is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,11 +34,7 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if (
|
|
38
|
-
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
-
and not is_string_dtype(df[column_name])
|
|
40
|
-
and not is_object_dtype(df[column_name])
|
|
41
|
-
):
|
|
37
|
+
if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
|
|
42
38
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
43
39
|
else:
|
|
44
40
|
meaning_type = FileColumnMeaningType.FEATURE
|
|
@@ -17,7 +17,6 @@ from pandas.api.types import (
|
|
|
17
17
|
is_numeric_dtype,
|
|
18
18
|
is_period_dtype,
|
|
19
19
|
is_string_dtype,
|
|
20
|
-
is_object_dtype,
|
|
21
20
|
)
|
|
22
21
|
|
|
23
22
|
from upgini.errors import ValidationError
|
|
@@ -220,7 +219,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
220
219
|
"""Check that string values less than maximum characters for LLM"""
|
|
221
220
|
# self.logger.info("Validate too long string values")
|
|
222
221
|
for col in self.data.columns:
|
|
223
|
-
if is_string_dtype(self.data[col])
|
|
222
|
+
if is_string_dtype(self.data[col]):
|
|
224
223
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
225
224
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
226
225
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -351,7 +350,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
351
350
|
if postal_code is not None and postal_code in self.data.columns:
|
|
352
351
|
# self.logger.info("Normalize postal code")
|
|
353
352
|
|
|
354
|
-
if is_string_dtype(self.data[postal_code])
|
|
353
|
+
if is_string_dtype(self.data[postal_code]):
|
|
355
354
|
try:
|
|
356
355
|
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
357
356
|
except Exception:
|
|
@@ -822,7 +821,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
822
821
|
return DataType.INT
|
|
823
822
|
elif is_float_dtype(pandas_data_type):
|
|
824
823
|
return DataType.DECIMAL
|
|
825
|
-
elif is_string_dtype(pandas_data_type)
|
|
824
|
+
elif is_string_dtype(pandas_data_type):
|
|
826
825
|
return DataType.STRING
|
|
827
826
|
else:
|
|
828
827
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|
|
@@ -21,7 +21,6 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
-
is_object_dtype,
|
|
25
24
|
is_period_dtype,
|
|
26
25
|
is_string_dtype,
|
|
27
26
|
)
|
|
@@ -2983,7 +2982,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2983
2982
|
|
|
2984
2983
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2985
2984
|
target = df[self.TARGET_NAME]
|
|
2986
|
-
if is_string_dtype(target)
|
|
2985
|
+
if is_string_dtype(target):
|
|
2987
2986
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2988
2987
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
2989
2988
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3383,8 +3382,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3383
3382
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3384
3383
|
else:
|
|
3385
3384
|
if x[column_name].isnull().all() or (
|
|
3386
|
-
|
|
3387
|
-
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3385
|
+
is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
|
|
3388
3386
|
):
|
|
3389
3387
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3390
3388
|
|
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import is_string_dtype
|
|
5
|
+
from pandas.api.types import is_string_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -20,6 +20,5 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
20
20
|
return [
|
|
21
21
|
col
|
|
22
22
|
for col in tmp.columns
|
|
23
|
-
if
|
|
24
|
-
and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
23
|
+
if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
25
24
|
]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from pandas.api.types import is_string_dtype
|
|
2
|
+
from pandas.api.types import is_string_dtype
|
|
3
3
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
|
9
9
|
return "country" in str(column_name).lower()
|
|
10
10
|
|
|
11
11
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
12
|
-
if not is_string_dtype(column)
|
|
12
|
+
if not is_string_dtype(column):
|
|
13
13
|
return False
|
|
14
14
|
|
|
15
15
|
all_count = len(column)
|
|
@@ -6,10 +6,7 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import
|
|
10
|
-
is_numeric_dtype,
|
|
11
|
-
is_period_dtype,
|
|
12
|
-
)
|
|
9
|
+
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
13
10
|
|
|
14
11
|
from upgini.errors import ValidationError
|
|
15
12
|
from upgini.metadata import SearchKey
|
|
@@ -81,6 +78,9 @@ class DateTimeSearchKeyConverter:
|
|
|
81
78
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
82
79
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
83
80
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
81
|
+
elif is_string_dtype(df[self.date_column]):
|
|
82
|
+
df[self.date_column] = df[self.date_column].apply(self.clean_date)
|
|
83
|
+
df[self.date_column] = self.parse_date(df)
|
|
84
84
|
elif is_period_dtype(df[self.date_column]):
|
|
85
85
|
df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
|
|
86
86
|
elif is_numeric_dtype(df[self.date_column]):
|
|
@@ -4,7 +4,7 @@ from hashlib import sha256
|
|
|
4
4
|
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
|
-
from pandas.api.types import is_string_dtype
|
|
7
|
+
from pandas.api.types import is_string_dtype
|
|
8
8
|
from upgini.resource_bundle import bundle
|
|
9
9
|
|
|
10
10
|
from upgini.metadata import SearchKey
|
|
@@ -18,7 +18,7 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
18
18
|
return str(column_name).lower() in ["email", "e_mail", "e-mail"]
|
|
19
19
|
|
|
20
20
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
21
|
-
if not is_string_dtype(column)
|
|
21
|
+
if not is_string_dtype(column):
|
|
22
22
|
return False
|
|
23
23
|
if not column.astype("string").str.contains("@").any():
|
|
24
24
|
return False
|
|
@@ -107,7 +107,7 @@ def balance_undersample(
|
|
|
107
107
|
min_class_count = vc[min_class_value]
|
|
108
108
|
|
|
109
109
|
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold =
|
|
110
|
+
min_class_threshold = min_class_percent * count
|
|
111
111
|
|
|
112
112
|
resampled_data = df
|
|
113
113
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|