upgini 1.1.312a1__py3-none-any.whl → 1.1.312a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +24 -35
- upgini/utils/email_utils.py +35 -17
- {upgini-1.1.312a1.dist-info → upgini-1.1.312a3.dist-info}/METADATA +1 -1
- {upgini-1.1.312a1.dist-info → upgini-1.1.312a3.dist-info}/RECORD +8 -8
- {upgini-1.1.312a1.dist-info → upgini-1.1.312a3.dist-info}/WHEEL +0 -0
- {upgini-1.1.312a1.dist-info → upgini-1.1.312a3.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.312a1"
|
|
1
|
+
__version__ = "1.1.312a3"
|
upgini/dataset.py
CHANGED
|
@@ -302,7 +302,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
302
302
|
key
|
|
303
303
|
for search_group in self.search_keys_checked
|
|
304
304
|
for key in search_group
|
|
305
|
-
if self.columns_renaming.get(key)
|
|
305
|
+
if not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
306
306
|
}
|
|
307
307
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
308
308
|
if (
|
upgini/features_enricher.py
CHANGED
|
@@ -91,7 +91,7 @@ from upgini.utils.display_utils import (
|
|
|
91
91
|
prepare_and_show_report,
|
|
92
92
|
show_request_quote_button,
|
|
93
93
|
)
|
|
94
|
-
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
94
|
+
from upgini.utils.email_utils import EmailDomainGenerator, EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
95
95
|
from upgini.utils.features_validator import FeaturesValidator
|
|
96
96
|
from upgini.utils.format import Format
|
|
97
97
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -1212,29 +1212,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1212
1212
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1213
1213
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1214
1214
|
|
|
1215
|
-
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1216
|
-
search_keys = self.search_keys.copy()
|
|
1217
|
-
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1218
|
-
|
|
1219
|
-
extended_X = x.copy()
|
|
1220
|
-
generated_features = []
|
|
1221
|
-
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1222
|
-
if date_column is not None:
|
|
1223
|
-
converter = DateTimeSearchKeyConverter(
|
|
1224
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1225
|
-
)
|
|
1226
|
-
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1227
|
-
generated_features.extend(converter.generated_features)
|
|
1228
|
-
email_column = self._get_email_column(search_keys)
|
|
1229
|
-
hem_column = self._get_hem_column(search_keys)
|
|
1230
|
-
if email_column:
|
|
1231
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1232
|
-
extended_X = converter.convert(extended_X)
|
|
1233
|
-
generated_features.extend(converter.generated_features)
|
|
1234
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1235
|
-
|
|
1236
|
-
return extended_X, search_keys
|
|
1237
|
-
|
|
1238
1215
|
def _is_input_same_as_fit(
|
|
1239
1216
|
self,
|
|
1240
1217
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1591,6 +1568,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1591
1568
|
df = converter.convert(df, keep_time=True)
|
|
1592
1569
|
generated_features = converter.generated_features
|
|
1593
1570
|
|
|
1571
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1572
|
+
if email_columns:
|
|
1573
|
+
generator = EmailDomainGenerator(email_columns)
|
|
1574
|
+
df = generator.generate(df)
|
|
1575
|
+
generated_features.extend(generator.generated_features)
|
|
1576
|
+
|
|
1594
1577
|
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1595
1578
|
df = normalizer.normalize(df)
|
|
1596
1579
|
columns_renaming = normalizer.columns_renaming
|
|
@@ -1607,13 +1590,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1607
1590
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1608
1591
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1609
1592
|
|
|
1610
|
-
email_column = self._get_email_column(search_keys)
|
|
1611
|
-
hem_column = self._get_hem_column(search_keys)
|
|
1612
|
-
if email_column:
|
|
1613
|
-
converter = EmailSearchKeyConverter(
|
|
1614
|
-
email_column, hem_column, search_keys, columns_renaming, [], self.bundle, self.logger
|
|
1615
|
-
)
|
|
1616
|
-
df = converter.convert(df)
|
|
1617
1593
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1618
1594
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1619
1595
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -2030,6 +2006,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2030
2006
|
if self.add_date_if_missing:
|
|
2031
2007
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2032
2008
|
|
|
2009
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2010
|
+
if email_columns:
|
|
2011
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2012
|
+
df = generator.generate(df)
|
|
2013
|
+
generated_features.extend(generator.generated_features)
|
|
2014
|
+
|
|
2033
2015
|
normalizer = Normalizer(
|
|
2034
2016
|
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2035
2017
|
)
|
|
@@ -2053,7 +2035,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2053
2035
|
|
|
2054
2036
|
email_column = self._get_email_column(search_keys)
|
|
2055
2037
|
hem_column = self._get_hem_column(search_keys)
|
|
2056
|
-
# email_converted_to_hem = False
|
|
2057
2038
|
if email_column:
|
|
2058
2039
|
converter = EmailSearchKeyConverter(
|
|
2059
2040
|
email_column,
|
|
@@ -2064,7 +2045,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2064
2045
|
self.logger,
|
|
2065
2046
|
)
|
|
2066
2047
|
df = converter.convert(df)
|
|
2067
|
-
generated_features.extend(converter.generated_features)
|
|
2068
2048
|
|
|
2069
2049
|
ip_column = self._get_ip_column(search_keys)
|
|
2070
2050
|
if ip_column:
|
|
@@ -2099,7 +2079,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2099
2079
|
for col in features_for_transform:
|
|
2100
2080
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2101
2081
|
features_not_to_pass = [
|
|
2102
|
-
c
|
|
2082
|
+
c
|
|
2083
|
+
for c in df.columns
|
|
2084
|
+
if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
|
|
2103
2085
|
]
|
|
2104
2086
|
|
|
2105
2087
|
if add_fit_system_record_id:
|
|
@@ -2446,6 +2428,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2446
2428
|
if self.add_date_if_missing:
|
|
2447
2429
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2448
2430
|
|
|
2431
|
+
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2432
|
+
if email_columns:
|
|
2433
|
+
generator = EmailDomainGenerator(
|
|
2434
|
+
email_columns
|
|
2435
|
+
)
|
|
2436
|
+
df = generator.generate(df)
|
|
2437
|
+
self.fit_generated_features.extend(generator.generated_features)
|
|
2438
|
+
|
|
2449
2439
|
# Checks that need validated date
|
|
2450
2440
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2451
2441
|
|
|
@@ -2488,7 +2478,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2488
2478
|
self.logger,
|
|
2489
2479
|
)
|
|
2490
2480
|
df = converter.convert(df)
|
|
2491
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2492
2481
|
|
|
2493
2482
|
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2494
2483
|
if ip_column:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -28,10 +28,31 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
28
28
|
return is_email_count / all_count > 0.1
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
class EmailDomainGenerator:
    """Derive a ``<email column>_domain`` feature from each email column.

    For every column name passed to the constructor, ``generate`` adds a
    sibling column whose name is the email column name plus ``DOMAIN_SUFFIX``
    and whose values are the part of the address after the ``@``.
    """

    # Suffix appended to an email column name to build the feature name.
    DOMAIN_SUFFIX = "_domain"

    def __init__(self, email_columns: List[str]):
        """Remember which columns hold emails.

        Args:
            email_columns: Names of the DataFrame columns containing emails.
        """
        self.email_columns = email_columns
        # Names of the domain columns produced so far (no duplicates).
        self.generated_features: List[str] = []

    def generate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add one domain column per configured email column.

        The columns are assigned on ``df`` itself and the same object is
        returned, so callers may either use the return value or keep using
        their own reference.

        Args:
            df: Frame that contains every column listed in ``email_columns``.

        Returns:
            ``df`` with the domain columns added.
        """
        for email_col in self.email_columns:
            domain_feature = email_col + self.DOMAIN_SUFFIX
            df[domain_feature] = df[email_col].apply(self._email_to_domain)
            # Calling generate() more than once on the same instance must not
            # report the same feature name several times.
            if domain_feature not in self.generated_features:
                self.generated_features.append(domain_feature)
        return df

    @staticmethod
    def _email_to_domain(email: str) -> Optional[str]:
        """Return the domain of ``email`` or ``None`` when it cannot be taken.

        A value yields a domain only when it is a string containing exactly
        one ``@`` followed by at least one character; anything else
        (non-strings, missing values, no ``@``, several ``@``, empty domain)
        yields ``None``.
        """
        if not isinstance(email, str):
            return None
        _, separator, domain = email.partition("@")
        # No "@" at all, nothing after it, or a second "@" further on.
        if not separator or not domain or "@" in domain:
            return None
        return domain
|
|
51
|
+
|
|
52
|
+
|
|
31
53
|
class EmailSearchKeyConverter:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
|
|
54
|
+
HEM_SUFFIX = "_hem"
|
|
55
|
+
ONE_DOMAIN_SUFFIX = "_one_domain"
|
|
35
56
|
|
|
36
57
|
def __init__(
|
|
37
58
|
self,
|
|
@@ -54,7 +75,6 @@ class EmailSearchKeyConverter:
|
|
|
54
75
|
else:
|
|
55
76
|
self.logger = logging.getLogger()
|
|
56
77
|
self.logger.setLevel("FATAL")
|
|
57
|
-
self.generated_features: List[str] = []
|
|
58
78
|
self.email_converted_to_hem = False
|
|
59
79
|
|
|
60
80
|
@staticmethod
|
|
@@ -78,18 +98,19 @@ class EmailSearchKeyConverter:
|
|
|
78
98
|
df = df.copy()
|
|
79
99
|
original_email_column = self.columns_renaming[self.email_column]
|
|
80
100
|
if self.hem_column is None:
|
|
81
|
-
|
|
82
|
-
|
|
101
|
+
hem_name = self.email_column + self.HEM_SUFFIX
|
|
102
|
+
df[hem_name] = df[self.email_column].apply(self._email_to_hem)
|
|
103
|
+
if df[hem_name].isna().all():
|
|
83
104
|
msg = self.bundle.get("all_emails_invalid").format(self.email_column)
|
|
84
105
|
print(msg)
|
|
85
106
|
self.logger.warning(msg)
|
|
86
|
-
df = df.drop(columns=
|
|
107
|
+
df = df.drop(columns=hem_name)
|
|
87
108
|
del self.search_keys[self.email_column]
|
|
88
109
|
return df
|
|
89
|
-
self.search_keys[
|
|
110
|
+
self.search_keys[hem_name] = SearchKey.HEM
|
|
90
111
|
if self.email_column in self.unnest_search_keys:
|
|
91
|
-
self.unnest_search_keys.append(
|
|
92
|
-
self.columns_renaming[
|
|
112
|
+
self.unnest_search_keys.append(hem_name)
|
|
113
|
+
self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
|
|
93
114
|
self.email_converted_to_hem = True
|
|
94
115
|
else:
|
|
95
116
|
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
@@ -98,16 +119,13 @@ class EmailSearchKeyConverter:
|
|
|
98
119
|
if self.email_column in self.unnest_search_keys:
|
|
99
120
|
self.unnest_search_keys.remove(self.email_column)
|
|
100
121
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
self.
|
|
122
|
+
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
123
|
+
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
124
|
+
self.columns_renaming[one_domain_name] = original_email_column
|
|
125
|
+
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
104
126
|
|
|
105
127
|
if self.email_converted_to_hem:
|
|
106
128
|
df = df.drop(columns=self.email_column)
|
|
107
129
|
del self.columns_renaming[self.email_column]
|
|
108
130
|
|
|
109
|
-
df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
|
|
110
|
-
self.generated_features.append(self.DOMAIN_COLUMN_NAME)
|
|
111
|
-
self.columns_renaming[self.DOMAIN_COLUMN_NAME] = original_email_column
|
|
112
|
-
|
|
113
131
|
return df
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=IaDaRN1MPzK9IEvOazFTrqDhFeyxseC5mkVDu1NRrYc,26
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=CdOE1h94E1YgStslQIPfvMp5z_ODt7QfXfxqpmYL5Xs,30758
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=FfFlFW3BArv2rQWGCs-SXrDDDcjQTwwJxzRysZlJfq0,186961
|
|
7
7
|
upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
9
|
upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
|
|
@@ -45,7 +45,7 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=O-IQbWtWJs6xTAr3m9FMRHyT-fL_28vCMrrt4eqfpa0,12025
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
|
-
upgini/utils/email_utils.py,sha256=
|
|
48
|
+
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.312a3.dist-info/METADATA,sha256=133zxblkCtyOblc05Qe0mfRcwYYU9qLcPbNNBjHl-mY,48155
|
|
61
|
+
upgini-1.1.312a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.1.312a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.312a3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|