upgini 1.1.312a4__py3-none-any.whl → 1.1.313__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; review the version changes below for details.

upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Set, Union
4
+ from typing import Dict, List, Optional, Set
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -113,21 +113,6 @@ class SearchKey(Enum):
113
113
  if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
114
114
  return SearchKey.MSISDN_RANGE_TO
115
115
 
116
- @staticmethod
117
- def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
118
- if isinstance(keys, SearchKey):
119
- keys = [keys]
120
- for col, key_type in search_keys.items():
121
- if key_type in keys:
122
- return col
123
- return None
124
-
125
- @staticmethod
126
- def find_all_keys(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> List[SearchKey]:
127
- if isinstance(keys, SearchKey):
128
- keys = [keys]
129
- return [col for col, key_type in search_keys.items() if key_type in keys]
130
-
131
116
 
132
117
  class DataType(Enum):
133
118
  INT = "INT"
@@ -0,0 +1,340 @@
1
+ from typing import Optional
2
+
3
+ import pandas as pd
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
+
6
+ from upgini.errors import ValidationError
7
+
8
+
9
class PhoneNormalizer:
    """Normalize a phone-number column to an integer international representation.

    Convention: a phone number is a plain int composed of
    country code + national destination code + subscriber number.
    Examples:
        41793834315  - Switzerland
        46767040672  - Sweden
        861065529988 - China
        18143008198  - USA
    """

    def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
        """
        df -- frame holding the phone column (and, optionally, the country
              column). It is modified in place during normalization.
        phone_column_name -- name of the column with phone numbers; must have
              string/object, float or int64 dtype.
        country_column_name -- optional column with ISO-3166 alpha-2 country
              codes, used to prepend the country calling code to bare
              national numbers.
        """
        self.df = df
        self.phone_column_name = phone_column_name
        self.country_column_name = country_column_name

    def normalize(self) -> pd.Series:
        """Convert the phone column to nullable Int64 and return that column.

        Returns a pd.Series (the normalized phone column), not the whole
        frame — the return annotation was corrected accordingly. Values that
        could not be parsed or failed length validation become <NA>.
        """
        self.phone_to_int()
        if self.country_column_name is not None:
            self.df = self.df.apply(self.add_prefix, axis=1)
        return self.df[self.phone_column_name].astype("Int64")

    def add_prefix(self, row):
        """Row-wise helper: prepend the country calling code to the phone value.

        The prefix is added only when the digit count equals the expected
        national-number length for that country; otherwise the value is
        assumed to already contain a country code and is left untouched.
        """
        phone = row[self.phone_column_name]
        if pd.isna(phone):
            return row
        country = row[self.country_column_name]
        country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
        if country_prefix_tuple is not None:
            country_prefix, number_of_digits = country_prefix_tuple
            if len(str(phone)) == number_of_digits:
                row[self.phone_column_name] = int(country_prefix + str(phone))
        return row

    def phone_to_int(self):
        """In-place conversion of the phone column to nullable Int64.

        Picks a converter based on the column dtype (string/object, float or
        int64). For strings, all non-numeric chars are removed before the int
        conversion. None is set for phone numbers that couldn't be converted
        to int.

        Raises ValidationError for unsupported column dtypes.
        """
        if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
            convert_func = self.phone_str_to_int_safe
        elif is_float_dtype(self.df[self.phone_column_name]):
            convert_func = self.phone_float_to_int_safe
        elif is_int64_dtype(self.df[self.phone_column_name]):
            convert_func = self.phone_int_to_int_safe
        else:
            raise ValidationError(
                f"phone_column_name {self.phone_column_name} doesn't have supported dtype. "
                f"Dataset dtypes: {self.df.dtypes}. "
                f"Contact developer and request to implement conversion of {self.phone_column_name} to int"
            )
        self.df[self.phone_column_name] = self.df[self.phone_column_name].apply(convert_func).astype("Int64")

    @staticmethod
    def phone_float_to_int_safe(value: float) -> Optional[int]:
        """Truncate a float to int and validate its length; None on any failure."""
        try:
            return PhoneNormalizer.validate_length(int(value))
        except Exception:
            return None

    @staticmethod
    def phone_int_to_int_safe(value: int) -> Optional[int]:
        """Validate an int phone number's length; None on any failure."""
        try:
            return PhoneNormalizer.validate_length(int(value))
        except Exception:
            return None

    @staticmethod
    def phone_str_to_int_safe(value: str) -> Optional[int]:
        """Strip all non-digit characters and parse; None if unparsable/invalid."""
        try:
            value = str(value)
            # Drop a trailing ".0" produced by float->str conversion so the
            # fractional zero doesn't end up inside the digit string.
            if value.endswith(".0"):
                value = value[: len(value) - 2]
            numeric_filter = filter(str.isdigit, value)
            numeric_string = "".join(numeric_filter)
            return PhoneNormalizer.validate_length(int(numeric_string))
        except Exception:
            return None

    @staticmethod
    def validate_length(value: int) -> Optional[int]:
        """Accept only numbers with 8 to 15 digits (inclusive); otherwise None."""
        if value < 10000000 or value > 999999999999999:
            return None
        else:
            return value
93
+
94
    # Mapping: ISO-3166 alpha-2 country code -> (international calling-code
    # prefix as a string, expected number of digits in the bare national
    # number). Used by add_prefix() to decide whether a value looks like a
    # national number that still needs its country code prepended.
    # NOTE(review): several entries share a prefix (e.g. "1" for NANP
    # countries, "44" for GB/GG/IM/JE) — the per-country digit counts are
    # heuristics, not a full numbering-plan validation.
    COUNTRIES_PREFIXES = {
        "US": ("1", 10),
        "CA": ("1", 10),
        "AI": ("1", 10),
        "AG": ("1", 10),
        "AS": ("1", 10),
        "BB": ("1", 10),
        "BS": ("1", 10),
        "VG": ("1", 10),
        "VI": ("1", 10),
        "KY": ("1", 10),
        "BM": ("1", 10),
        "GD": ("1", 10),
        "TC": ("1", 10),
        "MS": ("1", 10),
        "MP": ("1", 10),
        "GU": ("1", 10),
        "SX": ("1", 10),
        "LC": ("1", 10),
        "DM": ("1", 10),
        "VC": ("1", 10),
        "PR": ("1", 10),
        "TT": ("1", 10),
        "KN": ("1", 10),
        "JM": ("1", 10),
        "EG": ("20", 9),
        "SS": ("211", 9),
        "MA": ("212", 9),
        "EH": ("212", 4),
        "DZ": ("213", 8),
        "TN": ("216", 8),
        "LY": ("218", 9),
        "GM": ("220", 6),
        "SN": ("221", 9),
        "MR": ("222", 7),
        "ML": ("223", 8),
        "GN": ("224", 9),
        "CI": ("225", 7),
        "BF": ("226", 8),
        "NE": ("227", 8),
        "TG": ("228", 8),
        "BJ": ("229", 8),
        "MU": ("230", 7),
        "LR": ("231", 9),
        "SL": ("232", 8),
        "GH": ("233", 9),
        "NG": ("234", 9),
        "TD": ("235", 8),
        "CF": ("236", 7),
        "CM": ("237", 9),
        "CV": ("238", 7),
        "ST": ("239", 7),
        "GQ": ("240", 9),
        "GA": ("241", 8),
        "CG": ("242", 7),
        "CD": ("243", 9),
        "AO": ("244", 9),
        "GW": ("245", 6),
        "IO": ("246", 7),
        "AC": ("247", 5),
        "SC": ("248", 7),
        "SD": ("249", 9),
        "RW": ("250", 9),
        "ET": ("251", 9),
        "SO": ("252", 9),
        "DJ": ("253", 8),
        "KE": ("254", 9),
        "TZ": ("255", 9),
        "UG": ("256", 9),
        "BI": ("257", 8),
        "MZ": ("258", 8),
        "ZM": ("260", 9),
        "MG": ("261", 9),
        "RE": ("262", 9),
        "YT": ("262", 9),
        "TF": ("262", 9),
        "ZW": ("263", 9),
        "NA": ("264", 9),
        "MW": ("265", 7),
        "LS": ("266", 8),
        "BW": ("267", 7),
        "SZ": ("268", 8),
        "KM": ("269", 7),
        "ZA": ("27", 10),
        "SH": ("290", 5),
        "TA": ("290", 5),
        "ER": ("291", 7),
        "AT": ("43", 10),
        "AW": ("297", 7),
        "FO": ("298", 6),
        "GL": ("299", 6),
        "GR": ("30", 10),
        "BE": ("32", 8),
        "FR": ("33", 9),
        "ES": ("34", 9),
        "GI": ("350", 8),
        "PE": ("51", 8),
        "MX": ("52", 10),
        "CU": ("53", 8),
        "AR": ("54", 10),
        "BR": ("55", 10),
        "CL": ("56", 9),
        "CO": ("57", 8),
        "VE": ("58", 10),
        "PT": ("351", 9),
        "LU": ("352", 8),
        "IE": ("353", 8),
        "IS": ("354", 7),
        "AL": ("355", 8),
        "MT": ("356", 8),
        "CY": ("357", 8),
        "FI": ("358", 9),
        "BG": ("359", 8),
        "HU": ("36", 8),
        "LT": ("370", 8),
        "LV": ("371", 8),
        "EE": ("372", 7),
        "MD": ("373", 8),
        "AM": ("374", 8),
        "BY": ("375", 9),
        "AD": ("376", 6),
        "MC": ("377", 8),
        "SM": ("378", 9),
        "VA": ("3906698", 5),
        "UA": ("380", 9),
        "RS": ("381", 9),
        "ME": ("382", 8),
        "HR": ("385", 8),
        "SI": ("386", 8),
        "BA": ("387", 8),
        "MK": ("389", 8),
        "MY": ("60", 9),
        "AU": ("61", 9),
        "CX": ("61", 9),
        "CC": ("61", 9),
        "ID": ("62", 9),
        "PH": ("632", 7),
        "NZ": ("64", 8),
        "PN": ("64", 8),
        "SG": ("65", 8),
        "TH": ("66", 8),
        "IT": ("39", 10),
        "RO": ("40", 9),
        "CH": ("41", 9),
        "CZ": ("420", 9),
        "SK": ("421", 9),
        "GB": ("44", 10),
        "LI": ("423", 7),
        "GG": ("44", 10),
        "IM": ("44", 10),
        "JE": ("44", 10),
        "DK": ("45", 8),
        "SE": ("46", 8),
        "BD": ("880", 8),
        "TW": ("886", 9),
        "JP": ("81", 9),
        "KR": ("82", 9),
        "VN": ("84", 10),
        "KP": ("850", 8),
        "HK": ("852", 8),
        "MO": ("853", 8),
        "KH": ("855", 8),
        "LA": ("856", 8),
        "NO": ("47", 8),
        "SJ": ("47", 8),
        "BV": ("47", 8),
        "PL": ("48", 9),
        "DE": ("49", 10),
        "TR": ("90", 10),
        "IN": ("91", 10),
        "PK": ("92", 9),
        "AF": ("93", 9),
        "LK": ("94", 9),
        "MM": ("95", 7),
        "IR": ("98", 10),
        "MV": ("960", 7),
        "LB": ("961", 7),
        "JO": ("962", 9),
        "SY": ("963", 10),
        "IQ": ("964", 10),
        "KW": ("965", 7),
        "SA": ("966", 9),
        "YE": ("967", 7),
        "OM": ("968", 8),
        "PS": ("970", 8),
        "AE": ("971", 8),
        "IL": ("972", 9),
        "BH": ("973", 8),
        "QA": ("974", 8),
        "BT": ("975", 7),
        "MN": ("976", 8),
        "NP": ("977", 8),
        "TJ": ("992", 9),
        "TM": ("993", 8),
        "AZ": ("994", 9),
        "GE": ("995", 9),
        "KG": ("996", 9),
        "UZ": ("998", 9),
        "FK": ("500", 5),
        "BZ": ("501", 7),
        "GT": ("502", 8),
        "SV": ("503", 8),
        "HN": ("504", 8),
        "NI": ("505", 8),
        "CR": ("506", 8),
        "PA": ("507", 7),
        "PM": ("508", 6),
        "HT": ("509", 8),
        "GS": ("500", 5),
        "MF": ("590", 9),
        "BL": ("590", 9),
        "GP": ("590", 9),
        "BO": ("591", 9),
        "GY": ("592", 9),
        "EC": ("593", 9),
        "GF": ("594", 9),
        "PY": ("595", 9),
        "MQ": ("596", 9),
        "SR": ("597", 9),
        "UY": ("598", 9),
        "CW": ("599", 9),
        "BQ": ("599", 9),
        "RU": ("7", 10),
        "KZ": ("7", 10),
        "TL": ("670", 7),
        "NF": ("672", 7),
        "HM": ("672", 7),
        "BN": ("673", 7),
        "NR": ("674", 7),
        "PG": ("675", 7),
        "TO": ("676", 7),
        "SB": ("677", 7),
        "VU": ("678", 7),
        "FJ": ("679", 7),
        "PW": ("680", 7),
        "WF": ("681", 7),
        "CK": ("682", 5),
        "NU": ("683", 7),
        "WS": ("685", 7),
        "KI": ("686", 7),
        "NC": ("687", 7),
        "TV": ("688", 7),
        "PF": ("689", 7),
        "TK": ("690", 7),
        "FM": ("691", 7),
        "MH": ("692", 7),
    }
@@ -4,22 +4,6 @@ from pandas.api.types import is_object_dtype, is_string_dtype
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
6
6
 
7
- class CountrySearchKeyConverter:
8
-
9
- def __init__(self, country_col: str):
10
- self.country_col = country_col
11
-
12
- def convert(self, df: pd.DataFrame) -> pd.DataFrame:
13
- df[self.country_col] = (
14
- df[self.country_col]
15
- .astype("string")
16
- .str.upper()
17
- .str.replace(r"[^A-Z]", "", regex=True)
18
- .str.replace("UK", "GB", regex=False)
19
- )
20
- return df
21
-
22
-
23
7
  class CountrySearchKeyDetector(BaseSearchKeyDetector):
24
8
  def _is_search_key_by_name(self, column_name: str) -> bool:
25
9
  return "country" in str(column_name).lower()
@@ -1,16 +1,18 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- import pytz
5
4
  from typing import Dict, List, Optional
6
5
 
7
6
  import numpy as np
8
7
  import pandas as pd
9
8
  from dateutil.relativedelta import relativedelta
10
- from pandas.api.types import is_numeric_dtype, is_period_dtype
9
+ from pandas.api.types import (
10
+ is_numeric_dtype,
11
+ is_period_dtype,
12
+ )
11
13
 
12
14
  from upgini.errors import ValidationError
13
- from upgini.metadata import EVAL_SET_INDEX, SearchKey
15
+ from upgini.metadata import SearchKey
14
16
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
15
17
  from upgini.utils.warning_counter import WarningCounter
16
18
 
@@ -29,22 +31,18 @@ DATE_FORMATS = [
29
31
  "%Y-%m-%dT%H:%M:%S.%f",
30
32
  ]
31
33
 
32
- DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
34
+ DATETIME_PATTERN = r"^[\d\s\.\-:T/]+$"
33
35
 
34
36
 
35
37
  class DateTimeSearchKeyConverter:
36
38
  DATETIME_COL = "_date_time"
37
- # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
38
- MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
39
39
 
40
40
  def __init__(
41
41
  self,
42
42
  date_column: str,
43
43
  date_format: Optional[str] = None,
44
44
  logger: Optional[logging.Logger] = None,
45
- bundle: Optional[ResourceBundle] = None,
46
- warnings_counter: Optional[WarningCounter] = None,
47
- silent_mode=False,
45
+ bundle: ResourceBundle = None,
48
46
  ):
49
47
  self.date_column = date_column
50
48
  self.date_format = date_format
@@ -55,8 +53,6 @@ class DateTimeSearchKeyConverter:
55
53
  self.logger.setLevel("FATAL")
56
54
  self.generated_features: List[str] = []
57
55
  self.bundle = bundle or get_custom_bundle()
58
- self.warnings_counter = warnings_counter or WarningCounter()
59
- self.silent_mode = silent_mode
60
56
 
61
57
  @staticmethod
62
58
  def _int_to_opt(i: int) -> Optional[int]:
@@ -92,13 +88,13 @@ class DateTimeSearchKeyConverter:
92
88
  # 315532801000 - 2524608001000 - milliseconds
93
89
  # 315532801000000 - 2524608001000000 - microseconds
94
90
  # 315532801000000000 - 2524608001000000000 - nanoseconds
95
- if df[self.date_column].apply(lambda x: 10**16 < x).all():
91
+ if df[self.date_column].apply(lambda x: 10 ** 16 < x).all():
96
92
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
97
- elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
93
+ elif df[self.date_column].apply(lambda x: 10 ** 14 < x < 10 ** 16).all():
98
94
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
99
- elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
95
+ elif df[self.date_column].apply(lambda x: 10 ** 11 < x < 10 ** 14).all():
100
96
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
101
- elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
97
+ elif df[self.date_column].apply(lambda x: 0 < x < 10 ** 11).all():
102
98
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
103
99
  else:
104
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
@@ -112,9 +108,6 @@ class DateTimeSearchKeyConverter:
112
108
  # as additional features
113
109
  seconds = "datetime_seconds"
114
110
  df[self.date_column] = df[self.date_column].dt.tz_localize(None)
115
-
116
- df = self.clean_old_dates(df)
117
-
118
111
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
119
112
 
120
113
  seconds_without_na = df[seconds].dropna()
@@ -159,19 +152,6 @@ class DateTimeSearchKeyConverter:
159
152
  except ValueError:
160
153
  raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
161
154
 
162
- def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
163
- condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
164
- old_subset = df[condition]
165
- if len(old_subset) > 0:
166
- df.loc[condition, self.date_column] = None
167
- self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
168
- msg = self.bundle.get("dataset_drop_old_dates")
169
- self.logger.warning(msg)
170
- if not self.silent_mode:
171
- print(msg)
172
- self.warnings_counter.increment()
173
- return df
174
-
175
155
 
176
156
  def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
177
157
  try:
@@ -258,18 +238,16 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
258
238
 
259
239
 
260
240
  def validate_dates_distribution(
261
- df: pd.DataFrame,
241
+ X: pd.DataFrame,
262
242
  search_keys: Dict[str, SearchKey],
263
243
  logger: Optional[logging.Logger] = None,
264
244
  bundle: Optional[ResourceBundle] = None,
265
245
  warning_counter: Optional[WarningCounter] = None,
266
246
  ):
267
- maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
268
-
269
- if EVAL_SET_INDEX in df.columns:
270
- X = df.query(f"{EVAL_SET_INDEX} == 0")
271
- else:
272
- X = df
247
+ maybe_date_col = None
248
+ for key, key_type in search_keys.items():
249
+ if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
250
+ maybe_date_col = key
273
251
 
274
252
  if maybe_date_col is None:
275
253
  for col in X.columns:
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.api.types import is_object_dtype, is_string_dtype
8
8
 
9
9
  from upgini.metadata import SearchKey
10
- from upgini.resource_bundle import ResourceBundle, get_custom_bundle
10
+ from upgini.resource_bundle import bundle
11
11
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
12
12
 
13
13
  EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
@@ -28,53 +28,29 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
28
28
  return is_email_count / all_count > 0.1
29
29
 
30
30
 
31
- class EmailDomainGenerator:
32
- DOMAIN_SUFFIX = "_domain"
33
-
34
- def __init__(self, email_columns: List[str]):
35
- self.email_columns = email_columns
36
- self.generated_features = []
37
-
38
- def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
- for email_col in self.email_columns:
40
- domain_feature = email_col + self.DOMAIN_SUFFIX
41
- df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
- self.generated_features.append(domain_feature)
43
- return df
44
-
45
- @staticmethod
46
- def _email_to_domain(email: str) -> Optional[str]:
47
- if email is not None and isinstance(email, str) and "@" in email:
48
- name_and_domain = email.split("@")
49
- if len(name_and_domain) == 2 and len(name_and_domain[1]) > 0:
50
- return name_and_domain[1]
51
-
52
-
53
31
  class EmailSearchKeyConverter:
54
- HEM_SUFFIX = "_hem"
55
- ONE_DOMAIN_SUFFIX = "_one_domain"
32
+ HEM_COLUMN_NAME = "hashed_email"
33
+ DOMAIN_COLUMN_NAME = "email_domain"
34
+ EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
56
35
 
57
36
  def __init__(
58
37
  self,
59
38
  email_column: str,
60
39
  hem_column: Optional[str],
61
40
  search_keys: Dict[str, SearchKey],
62
- columns_renaming: Dict[str, str],
63
41
  unnest_search_keys: Optional[List[str]] = None,
64
- bundle: Optional[ResourceBundle] = None,
65
42
  logger: Optional[logging.Logger] = None,
66
43
  ):
67
44
  self.email_column = email_column
68
45
  self.hem_column = hem_column
69
46
  self.search_keys = search_keys
70
- self.columns_renaming = columns_renaming
71
47
  self.unnest_search_keys = unnest_search_keys
72
- self.bundle = bundle or get_custom_bundle()
73
48
  if logger is not None:
74
49
  self.logger = logger
75
50
  else:
76
51
  self.logger = logging.getLogger()
77
52
  self.logger.setLevel("FATAL")
53
+ self.generated_features: List[str] = []
78
54
  self.email_converted_to_hem = False
79
55
 
80
56
  @staticmethod
@@ -85,7 +61,7 @@ class EmailSearchKeyConverter:
85
61
  if not EMAIL_REGEX.fullmatch(email):
86
62
  return None
87
63
 
88
- return sha256(email.lower().encode("utf-8")).hexdigest().lower()
64
+ return sha256(email.lower().encode("utf-8")).hexdigest()
89
65
 
90
66
  @staticmethod
91
67
  def _email_to_one_domain(email: str) -> Optional[str]:
@@ -96,36 +72,28 @@ class EmailSearchKeyConverter:
96
72
 
97
73
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
98
74
  df = df.copy()
99
- original_email_column = self.columns_renaming[self.email_column]
100
75
  if self.hem_column is None:
101
- hem_name = self.email_column + self.HEM_SUFFIX
102
- df[hem_name] = df[self.email_column].apply(self._email_to_hem)
103
- if df[hem_name].isna().all():
104
- msg = self.bundle.get("all_emails_invalid").format(self.email_column)
76
+ df[self.HEM_COLUMN_NAME] = df[self.email_column].apply(self._email_to_hem)
77
+ if df[self.HEM_COLUMN_NAME].isna().all():
78
+ msg = bundle.get("all_emails_invalid").format(self.email_column)
105
79
  print(msg)
106
80
  self.logger.warning(msg)
107
- df = df.drop(columns=hem_name)
81
+ df = df.drop(columns=self.HEM_COLUMN_NAME)
108
82
  del self.search_keys[self.email_column]
109
83
  return df
110
- self.search_keys[hem_name] = SearchKey.HEM
111
- if self.email_column in self.unnest_search_keys:
112
- self.unnest_search_keys.append(hem_name)
113
- self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
84
+ self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
+ self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
114
86
  self.email_converted_to_hem = True
115
- else:
116
- df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
117
87
 
118
88
  del self.search_keys[self.email_column]
119
89
  if self.email_column in self.unnest_search_keys:
120
90
  self.unnest_search_keys.remove(self.email_column)
121
91
 
122
- one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
123
- df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
124
- self.columns_renaming[one_domain_name] = original_email_column
125
- self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
92
+ df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
93
+
94
+ self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
126
95
 
127
- if self.email_converted_to_hem:
128
- df = df.drop(columns=self.email_column)
129
- del self.columns_renaming[self.email_column]
96
+ df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
97
+ self.generated_features.append(self.DOMAIN_COLUMN_NAME)
130
98
 
131
99
  return df