upgini 1.2.29a7__tar.gz → 1.2.30a7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.29a7 → upgini-1.2.30a7}/PKG-INFO +1 -1
- upgini-1.2.30a7/src/upgini/__about__.py +1 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/features_enricher.py +9 -18
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/http.py +1 -1
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/email_utils.py +1 -1
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/features_validator.py +1 -13
- upgini-1.2.29a7/src/upgini/__about__.py +0 -1
- {upgini-1.2.29a7 → upgini-1.2.30a7}/.gitignore +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/LICENSE +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/README.md +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/pyproject.toml +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/ads.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/dataset.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/errors.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/metadata.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/metrics.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/search_task.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/spinner.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.29a7 → upgini-1.2.30a7}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.30a7"
|
|
@@ -1447,11 +1447,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1447
1447
|
client_features = [
|
|
1448
1448
|
c
|
|
1449
1449
|
for c in X_sampled.columns.to_list()
|
|
1450
|
-
if (
|
|
1451
|
-
not self.select_features
|
|
1452
|
-
or c in self.feature_names_
|
|
1453
|
-
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1454
|
-
)
|
|
1450
|
+
if (not self.select_features or c in self.feature_names_)
|
|
1455
1451
|
and c
|
|
1456
1452
|
not in (
|
|
1457
1453
|
excluding_search_keys
|
|
@@ -1668,10 +1664,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1668
1664
|
generated_features = []
|
|
1669
1665
|
if date_column is not None:
|
|
1670
1666
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1671
|
-
|
|
1672
|
-
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1673
|
-
df_with_date_features[date_column] = df[date_column]
|
|
1674
|
-
df = df_with_date_features
|
|
1667
|
+
df = converter.convert(df, keep_time=True)
|
|
1675
1668
|
generated_features = converter.generated_features
|
|
1676
1669
|
|
|
1677
1670
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1680,10 +1673,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1680
1673
|
df = generator.generate(df)
|
|
1681
1674
|
generated_features.extend(generator.generated_features)
|
|
1682
1675
|
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
columns_renaming = {c: c for c in df.columns}
|
|
1676
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
1677
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1678
|
+
columns_renaming = normalizer.columns_renaming
|
|
1687
1679
|
|
|
1688
1680
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1689
1681
|
|
|
@@ -2113,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2113
2105
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2114
2106
|
if date_column is not None:
|
|
2115
2107
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2116
|
-
df = converter.convert(df
|
|
2108
|
+
df = converter.convert(df)
|
|
2117
2109
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2118
2110
|
generated_features.extend(converter.generated_features)
|
|
2119
2111
|
else:
|
|
@@ -2208,12 +2200,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2208
2200
|
|
|
2209
2201
|
if add_fit_system_record_id:
|
|
2210
2202
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2203
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2204
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2211
2205
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2212
2206
|
features_not_to_pass.append(SORT_ID)
|
|
2213
2207
|
|
|
2214
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2215
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2216
|
-
|
|
2217
2208
|
# search keys might be changed after explode
|
|
2218
2209
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2219
2210
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2232,7 +2223,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2232
2223
|
|
|
2233
2224
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2234
2225
|
|
|
2235
|
-
df_without_features = df.drop(columns=features_not_to_pass
|
|
2226
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2236
2227
|
|
|
2237
2228
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2238
2229
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] =
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
|
@@ -39,7 +39,7 @@ class EmailDomainGenerator:
|
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
41
|
if domain_feature not in df.columns:
|
|
42
|
-
df[domain_feature] = df[email_col].apply(self._email_to_domain)
|
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain)
|
|
43
43
|
self.generated_features.append(domain_feature)
|
|
44
44
|
return df
|
|
45
45
|
|
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
-
import numpy as np
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
8
7
|
|
|
@@ -84,21 +83,10 @@ class FeaturesValidator:
|
|
|
84
83
|
return [
|
|
85
84
|
i
|
|
86
85
|
for i in df
|
|
87
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
86
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
|
|
88
87
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
89
88
|
]
|
|
90
89
|
|
|
91
|
-
@staticmethod
|
|
92
|
-
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
-
return (
|
|
94
|
-
is_integer_dtype(series)
|
|
95
|
-
or series.dropna()
|
|
96
|
-
.apply(
|
|
97
|
-
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
-
)
|
|
99
|
-
.all()
|
|
100
|
-
)
|
|
101
|
-
|
|
102
90
|
@staticmethod
|
|
103
91
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
104
92
|
return [i for i in df if df[i].nunique() <= 1]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.29a7"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|