upgini 1.1.309a1__py3-none-any.whl → 1.1.309a3511.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; consult the registry's advisory page for more details.

upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Set, Union
4
+ from typing import Dict, List, Optional, Set
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -113,21 +113,6 @@ class SearchKey(Enum):
113
113
  if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
114
114
  return SearchKey.MSISDN_RANGE_TO
115
115
 
116
- @staticmethod
117
- def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
118
- if isinstance(keys, SearchKey):
119
- keys = [keys]
120
- for col, key_type in search_keys.items():
121
- if key_type in keys:
122
- return col
123
- return None
124
-
125
- @staticmethod
126
- def find_all_keys(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> List[SearchKey]:
127
- if isinstance(keys, SearchKey):
128
- keys = [keys]
129
- return [col for col, key_type in search_keys.items() if key_type in keys]
130
-
131
116
 
132
117
  class DataType(Enum):
133
118
  INT = "INT"
@@ -0,0 +1,340 @@
1
+ from typing import Optional
2
+
3
+ import pandas as pd
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
+
6
+ from upgini.errors import ValidationError
7
+
8
+
9
+ class PhoneNormalizer:
10
+ def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
11
+ self.df = df
12
+ self.phone_column_name = phone_column_name
13
+ self.country_column_name = country_column_name
14
+
15
+ def normalize(self) -> pd.DataFrame:
16
+ self.phone_to_int()
17
+ if self.country_column_name is not None:
18
+ self.df = self.df.apply(self.add_prefix, axis=1)
19
+ return self.df[self.phone_column_name].astype("Int64")
20
+
21
+ def add_prefix(self, row):
22
+ phone = row[self.phone_column_name]
23
+ if pd.isna(phone):
24
+ return row
25
+ country = row[self.country_column_name]
26
+ country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
27
+ if country_prefix_tuple is not None:
28
+ country_prefix, number_of_digits = country_prefix_tuple
29
+ if len(str(phone)) == number_of_digits:
30
+ row[self.phone_column_name] = int(country_prefix + str(phone))
31
+ return row
32
+
33
+ def phone_to_int(self):
34
+ """
35
+ Convention: phone number is always presented as int number.
36
+ phone_number = Country code + National Destination Code + Subscriber Number.
37
+ Examples:
38
+ 41793834315 for Switzerland
39
+ 46767040672 for Sweden
40
+ 861065529988 for China
41
+ 18143008198 for the USA
42
+ Inplace conversion of phone to int.
43
+
44
+ The method removes all non-numeric characters from the string and converts it to int.
45
+ None will be set for phone numbers that couldn't be converted to int
46
+ """
47
+ if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
48
+ convert_func = self.phone_str_to_int_safe
49
+ elif is_float_dtype(self.df[self.phone_column_name]):
50
+ convert_func = self.phone_float_to_int_safe
51
+ elif is_int64_dtype(self.df[self.phone_column_name]):
52
+ convert_func = self.phone_int_to_int_safe
53
+ else:
54
+ raise ValidationError(
55
+ f"phone_column_name {self.phone_column_name} doesn't have supported dtype. "
56
+ f"Dataset dtypes: {self.df.dtypes}. "
57
+ f"Contact developer and request to implement conversion of {self.phone_column_name} to int"
58
+ )
59
+ self.df[self.phone_column_name] = self.df[self.phone_column_name].apply(convert_func).astype("Int64")
60
+
61
+ @staticmethod
62
+ def phone_float_to_int_safe(value: float) -> Optional[int]:
63
+ try:
64
+ return PhoneNormalizer.validate_length(int(value))
65
+ except Exception:
66
+ return None
67
+
68
+ @staticmethod
69
+ def phone_int_to_int_safe(value: int) -> Optional[int]:
70
+ try:
71
+ return PhoneNormalizer.validate_length(int(value))
72
+ except Exception:
73
+ return None
74
+
75
+ @staticmethod
76
+ def phone_str_to_int_safe(value: str) -> Optional[int]:
77
+ try:
78
+ value = str(value)
79
+ if value.endswith(".0"):
80
+ value = value[: len(value) - 2]
81
+ numeric_filter = filter(str.isdigit, value)
82
+ numeric_string = "".join(numeric_filter)
83
+ return PhoneNormalizer.validate_length(int(numeric_string))
84
+ except Exception:
85
+ return None
86
+
87
+ @staticmethod
88
+ def validate_length(value: int) -> Optional[int]:
89
+ if value < 10000000 or value > 999999999999999:
90
+ return None
91
+ else:
92
+ return value
93
+
94
+ COUNTRIES_PREFIXES = {
95
+ "US": ("1", 10),
96
+ "CA": ("1", 10),
97
+ "AI": ("1", 10),
98
+ "AG": ("1", 10),
99
+ "AS": ("1", 10),
100
+ "BB": ("1", 10),
101
+ "BS": ("1", 10),
102
+ "VG": ("1", 10),
103
+ "VI": ("1", 10),
104
+ "KY": ("1", 10),
105
+ "BM": ("1", 10),
106
+ "GD": ("1", 10),
107
+ "TC": ("1", 10),
108
+ "MS": ("1", 10),
109
+ "MP": ("1", 10),
110
+ "GU": ("1", 10),
111
+ "SX": ("1", 10),
112
+ "LC": ("1", 10),
113
+ "DM": ("1", 10),
114
+ "VC": ("1", 10),
115
+ "PR": ("1", 10),
116
+ "TT": ("1", 10),
117
+ "KN": ("1", 10),
118
+ "JM": ("1", 10),
119
+ "EG": ("20", 9),
120
+ "SS": ("211", 9),
121
+ "MA": ("212", 9),
122
+ "EH": ("212", 4),
123
+ "DZ": ("213", 8),
124
+ "TN": ("216", 8),
125
+ "LY": ("218", 9),
126
+ "GM": ("220", 6),
127
+ "SN": ("221", 9),
128
+ "MR": ("222", 7),
129
+ "ML": ("223", 8),
130
+ "GN": ("224", 9),
131
+ "CI": ("225", 7),
132
+ "BF": ("226", 8),
133
+ "NE": ("227", 8),
134
+ "TG": ("228", 8),
135
+ "BJ": ("229", 8),
136
+ "MU": ("230", 7),
137
+ "LR": ("231", 9),
138
+ "SL": ("232", 8),
139
+ "GH": ("233", 9),
140
+ "NG": ("234", 9),
141
+ "TD": ("235", 8),
142
+ "CF": ("236", 7),
143
+ "CM": ("237", 9),
144
+ "CV": ("238", 7),
145
+ "ST": ("239", 7),
146
+ "GQ": ("240", 9),
147
+ "GA": ("241", 8),
148
+ "CG": ("242", 7),
149
+ "CD": ("243", 9),
150
+ "AO": ("244", 9),
151
+ "GW": ("245", 6),
152
+ "IO": ("246", 7),
153
+ "AC": ("247", 5),
154
+ "SC": ("248", 7),
155
+ "SD": ("249", 9),
156
+ "RW": ("250", 9),
157
+ "ET": ("251", 9),
158
+ "SO": ("252", 9),
159
+ "DJ": ("253", 8),
160
+ "KE": ("254", 9),
161
+ "TZ": ("255", 9),
162
+ "UG": ("256", 9),
163
+ "BI": ("257", 8),
164
+ "MZ": ("258", 8),
165
+ "ZM": ("260", 9),
166
+ "MG": ("261", 9),
167
+ "RE": ("262", 9),
168
+ "YT": ("262", 9),
169
+ "TF": ("262", 9),
170
+ "ZW": ("263", 9),
171
+ "NA": ("264", 9),
172
+ "MW": ("265", 7),
173
+ "LS": ("266", 8),
174
+ "BW": ("267", 7),
175
+ "SZ": ("268", 8),
176
+ "KM": ("269", 7),
177
+ "ZA": ("27", 10),
178
+ "SH": ("290", 5),
179
+ "TA": ("290", 5),
180
+ "ER": ("291", 7),
181
+ "AT": ("43", 10),
182
+ "AW": ("297", 7),
183
+ "FO": ("298", 6),
184
+ "GL": ("299", 6),
185
+ "GR": ("30", 10),
186
+ "BE": ("32", 8),
187
+ "FR": ("33", 9),
188
+ "ES": ("34", 9),
189
+ "GI": ("350", 8),
190
+ "PE": ("51", 8),
191
+ "MX": ("52", 10),
192
+ "CU": ("53", 8),
193
+ "AR": ("54", 10),
194
+ "BR": ("55", 10),
195
+ "CL": ("56", 9),
196
+ "CO": ("57", 8),
197
+ "VE": ("58", 10),
198
+ "PT": ("351", 9),
199
+ "LU": ("352", 8),
200
+ "IE": ("353", 8),
201
+ "IS": ("354", 7),
202
+ "AL": ("355", 8),
203
+ "MT": ("356", 8),
204
+ "CY": ("357", 8),
205
+ "FI": ("358", 9),
206
+ "BG": ("359", 8),
207
+ "HU": ("36", 8),
208
+ "LT": ("370", 8),
209
+ "LV": ("371", 8),
210
+ "EE": ("372", 7),
211
+ "MD": ("373", 8),
212
+ "AM": ("374", 8),
213
+ "BY": ("375", 9),
214
+ "AD": ("376", 6),
215
+ "MC": ("377", 8),
216
+ "SM": ("378", 9),
217
+ "VA": ("3906698", 5),
218
+ "UA": ("380", 9),
219
+ "RS": ("381", 9),
220
+ "ME": ("382", 8),
221
+ "HR": ("385", 8),
222
+ "SI": ("386", 8),
223
+ "BA": ("387", 8),
224
+ "MK": ("389", 8),
225
+ "MY": ("60", 9),
226
+ "AU": ("61", 9),
227
+ "CX": ("61", 9),
228
+ "CC": ("61", 9),
229
+ "ID": ("62", 9),
230
+ "PH": ("632", 7),
231
+ "NZ": ("64", 8),
232
+ "PN": ("64", 8),
233
+ "SG": ("65", 8),
234
+ "TH": ("66", 8),
235
+ "IT": ("39", 10),
236
+ "RO": ("40", 9),
237
+ "CH": ("41", 9),
238
+ "CZ": ("420", 9),
239
+ "SK": ("421", 9),
240
+ "GB": ("44", 10),
241
+ "LI": ("423", 7),
242
+ "GG": ("44", 10),
243
+ "IM": ("44", 10),
244
+ "JE": ("44", 10),
245
+ "DK": ("45", 8),
246
+ "SE": ("46", 8),
247
+ "BD": ("880", 8),
248
+ "TW": ("886", 9),
249
+ "JP": ("81", 9),
250
+ "KR": ("82", 9),
251
+ "VN": ("84", 10),
252
+ "KP": ("850", 8),
253
+ "HK": ("852", 8),
254
+ "MO": ("853", 8),
255
+ "KH": ("855", 8),
256
+ "LA": ("856", 8),
257
+ "NO": ("47", 8),
258
+ "SJ": ("47", 8),
259
+ "BV": ("47", 8),
260
+ "PL": ("48", 9),
261
+ "DE": ("49", 10),
262
+ "TR": ("90", 10),
263
+ "IN": ("91", 10),
264
+ "PK": ("92", 9),
265
+ "AF": ("93", 9),
266
+ "LK": ("94", 9),
267
+ "MM": ("95", 7),
268
+ "IR": ("98", 10),
269
+ "MV": ("960", 7),
270
+ "LB": ("961", 7),
271
+ "JO": ("962", 9),
272
+ "SY": ("963", 10),
273
+ "IQ": ("964", 10),
274
+ "KW": ("965", 7),
275
+ "SA": ("966", 9),
276
+ "YE": ("967", 7),
277
+ "OM": ("968", 8),
278
+ "PS": ("970", 8),
279
+ "AE": ("971", 8),
280
+ "IL": ("972", 9),
281
+ "BH": ("973", 8),
282
+ "QA": ("974", 8),
283
+ "BT": ("975", 7),
284
+ "MN": ("976", 8),
285
+ "NP": ("977", 8),
286
+ "TJ": ("992", 9),
287
+ "TM": ("993", 8),
288
+ "AZ": ("994", 9),
289
+ "GE": ("995", 9),
290
+ "KG": ("996", 9),
291
+ "UZ": ("998", 9),
292
+ "FK": ("500", 5),
293
+ "BZ": ("501", 7),
294
+ "GT": ("502", 8),
295
+ "SV": ("503", 8),
296
+ "HN": ("504", 8),
297
+ "NI": ("505", 8),
298
+ "CR": ("506", 8),
299
+ "PA": ("507", 7),
300
+ "PM": ("508", 6),
301
+ "HT": ("509", 8),
302
+ "GS": ("500", 5),
303
+ "MF": ("590", 9),
304
+ "BL": ("590", 9),
305
+ "GP": ("590", 9),
306
+ "BO": ("591", 9),
307
+ "GY": ("592", 9),
308
+ "EC": ("593", 9),
309
+ "GF": ("594", 9),
310
+ "PY": ("595", 9),
311
+ "MQ": ("596", 9),
312
+ "SR": ("597", 9),
313
+ "UY": ("598", 9),
314
+ "CW": ("599", 9),
315
+ "BQ": ("599", 9),
316
+ "RU": ("7", 10),
317
+ "KZ": ("7", 10),
318
+ "TL": ("670", 7),
319
+ "NF": ("672", 7),
320
+ "HM": ("672", 7),
321
+ "BN": ("673", 7),
322
+ "NR": ("674", 7),
323
+ "PG": ("675", 7),
324
+ "TO": ("676", 7),
325
+ "SB": ("677", 7),
326
+ "VU": ("678", 7),
327
+ "FJ": ("679", 7),
328
+ "PW": ("680", 7),
329
+ "WF": ("681", 7),
330
+ "CK": ("682", 5),
331
+ "NU": ("683", 7),
332
+ "WS": ("685", 7),
333
+ "KI": ("686", 7),
334
+ "NC": ("687", 7),
335
+ "TV": ("688", 7),
336
+ "PF": ("689", 7),
337
+ "TK": ("690", 7),
338
+ "FM": ("691", 7),
339
+ "MH": ("692", 7),
340
+ }
@@ -4,22 +4,6 @@ from pandas.api.types import is_object_dtype, is_string_dtype
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
6
6
 
7
- class CountrySearchKeyConverter:
8
-
9
- def __init__(self, country_col: str):
10
- self.country_col = country_col
11
-
12
- def convert(self, df: pd.DataFrame) -> pd.DataFrame:
13
- df[self.country_col] = (
14
- df[self.country_col]
15
- .astype("string")
16
- .str.upper()
17
- .str.replace(r"[^A-Z]", "", regex=True)
18
- .str.replace("UK", "GB", regex=False)
19
- )
20
- return df
21
-
22
-
23
7
  class CountrySearchKeyDetector(BaseSearchKeyDetector):
24
8
  def _is_search_key_by_name(self, column_name: str) -> bool:
25
9
  return "country" in str(column_name).lower()
@@ -6,10 +6,13 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import is_numeric_dtype, is_period_dtype
9
+ from pandas.api.types import (
10
+ is_numeric_dtype,
11
+ is_period_dtype,
12
+ )
10
13
 
11
14
  from upgini.errors import ValidationError
12
- from upgini.metadata import EVAL_SET_INDEX, SearchKey
15
+ from upgini.metadata import SearchKey
13
16
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
17
  from upgini.utils.warning_counter import WarningCounter
15
18
 
@@ -33,16 +36,13 @@ DATETIME_PATTERN = r"^[\d\s\.\-:T/]+$"
33
36
 
34
37
  class DateTimeSearchKeyConverter:
35
38
  DATETIME_COL = "_date_time"
36
- MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
37
39
 
38
40
  def __init__(
39
41
  self,
40
42
  date_column: str,
41
43
  date_format: Optional[str] = None,
42
44
  logger: Optional[logging.Logger] = None,
43
- bundle: Optional[ResourceBundle] = None,
44
- warnings_counter: Optional[WarningCounter] = None,
45
- silent_mode=False,
45
+ bundle: ResourceBundle = None,
46
46
  ):
47
47
  self.date_column = date_column
48
48
  self.date_format = date_format
@@ -53,8 +53,6 @@ class DateTimeSearchKeyConverter:
53
53
  self.logger.setLevel("FATAL")
54
54
  self.generated_features: List[str] = []
55
55
  self.bundle = bundle or get_custom_bundle()
56
- self.warnings_counter = warnings_counter or WarningCounter()
57
- self.silent_mode = silent_mode
58
56
 
59
57
  @staticmethod
60
58
  def _int_to_opt(i: int) -> Optional[int]:
@@ -90,13 +88,13 @@ class DateTimeSearchKeyConverter:
90
88
  # 315532801000 - 2524608001000 - milliseconds
91
89
  # 315532801000000 - 2524608001000000 - microseconds
92
90
  # 315532801000000000 - 2524608001000000000 - nanoseconds
93
- if df[self.date_column].apply(lambda x: 10**16 < x).all():
91
+ if df[self.date_column].apply(lambda x: 10 ** 16 < x).all():
94
92
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
95
- elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
93
+ elif df[self.date_column].apply(lambda x: 10 ** 14 < x < 10 ** 16).all():
96
94
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
97
- elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
95
+ elif df[self.date_column].apply(lambda x: 10 ** 11 < x < 10 ** 14).all():
98
96
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
99
- elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
97
+ elif df[self.date_column].apply(lambda x: 0 < x < 10 ** 11).all():
100
98
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
101
99
  else:
102
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
@@ -106,8 +104,6 @@ class DateTimeSearchKeyConverter:
106
104
  df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
107
105
  df[self.date_column] = self.parse_date(df)
108
106
 
109
- df = self.clean_old_dates(df)
110
-
111
107
  # If column with date is datetime then extract seconds of the day and minute of the hour
112
108
  # as additional features
113
109
  seconds = "datetime_seconds"
@@ -156,19 +152,6 @@ class DateTimeSearchKeyConverter:
156
152
  except ValueError:
157
153
  raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
158
154
 
159
- def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
160
- condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
161
- old_subset = df[condition]
162
- if len(old_subset) > 0:
163
- df.loc[condition, self.date_column] = None
164
- self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
165
- msg = self.bundle.get("dataset_drop_old_dates")
166
- self.logger.warning(msg)
167
- if not self.silent_mode:
168
- print(msg)
169
- self.warnings_counter.increment()
170
- return df
171
-
172
155
 
173
156
  def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
174
157
  try:
@@ -255,18 +238,16 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
255
238
 
256
239
 
257
240
  def validate_dates_distribution(
258
- df: pd.DataFrame,
241
+ X: pd.DataFrame,
259
242
  search_keys: Dict[str, SearchKey],
260
243
  logger: Optional[logging.Logger] = None,
261
244
  bundle: Optional[ResourceBundle] = None,
262
245
  warning_counter: Optional[WarningCounter] = None,
263
246
  ):
264
- maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
265
-
266
- if EVAL_SET_INDEX in df.columns:
267
- X = df.query(f"{EVAL_SET_INDEX} == 0")
268
- else:
269
- X = df
247
+ maybe_date_col = None
248
+ for key, key_type in search_keys.items():
249
+ if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
250
+ maybe_date_col = key
270
251
 
271
252
  if maybe_date_col is None:
272
253
  for col in X.columns:
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.api.types import is_object_dtype, is_string_dtype
8
8
 
9
9
  from upgini.metadata import SearchKey
10
- from upgini.resource_bundle import ResourceBundle, get_custom_bundle
10
+ from upgini.resource_bundle import bundle
11
11
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
12
12
 
13
13
  EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
@@ -38,17 +38,13 @@ class EmailSearchKeyConverter:
38
38
  email_column: str,
39
39
  hem_column: Optional[str],
40
40
  search_keys: Dict[str, SearchKey],
41
- columns_renaming: Dict[str, str],
42
41
  unnest_search_keys: Optional[List[str]] = None,
43
- bundle: Optional[ResourceBundle] = None,
44
42
  logger: Optional[logging.Logger] = None,
45
43
  ):
46
44
  self.email_column = email_column
47
45
  self.hem_column = hem_column
48
46
  self.search_keys = search_keys
49
- self.columns_renaming = columns_renaming
50
47
  self.unnest_search_keys = unnest_search_keys
51
- self.bundle = bundle or get_custom_bundle()
52
48
  if logger is not None:
53
49
  self.logger = logger
54
50
  else:
@@ -65,7 +61,7 @@ class EmailSearchKeyConverter:
65
61
  if not EMAIL_REGEX.fullmatch(email):
66
62
  return None
67
63
 
68
- return sha256(email.lower().encode("utf-8")).hexdigest().lower()
64
+ return sha256(email.lower().encode("utf-8")).hexdigest()
69
65
 
70
66
  @staticmethod
71
67
  def _email_to_one_domain(email: str) -> Optional[str]:
@@ -76,38 +72,28 @@ class EmailSearchKeyConverter:
76
72
 
77
73
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
78
74
  df = df.copy()
79
- original_email_column = self.columns_renaming[self.email_column]
80
75
  if self.hem_column is None:
81
76
  df[self.HEM_COLUMN_NAME] = df[self.email_column].apply(self._email_to_hem)
82
77
  if df[self.HEM_COLUMN_NAME].isna().all():
83
- msg = self.bundle.get("all_emails_invalid").format(self.email_column)
78
+ msg = bundle.get("all_emails_invalid").format(self.email_column)
84
79
  print(msg)
85
80
  self.logger.warning(msg)
86
81
  df = df.drop(columns=self.HEM_COLUMN_NAME)
87
82
  del self.search_keys[self.email_column]
88
83
  return df
89
84
  self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
90
- if self.email_column in self.unnest_search_keys:
91
- self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
92
- self.columns_renaming[self.HEM_COLUMN_NAME] = original_email_column # it could be upgini_email_unnest...
85
+ self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
93
86
  self.email_converted_to_hem = True
94
- else:
95
- df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
96
87
 
97
88
  del self.search_keys[self.email_column]
98
89
  if self.email_column in self.unnest_search_keys:
99
90
  self.unnest_search_keys.remove(self.email_column)
100
91
 
101
92
  df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
102
- self.columns_renaming[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = original_email_column
103
- self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
104
93
 
105
- if self.email_converted_to_hem:
106
- df = df.drop(columns=self.email_column)
107
- del self.columns_renaming[self.email_column]
94
+ self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
108
95
 
109
96
  df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
110
97
  self.generated_features.append(self.DOMAIN_COLUMN_NAME)
111
- self.columns_renaming[self.DOMAIN_COLUMN_NAME] = original_email_column
112
98
 
113
99
  return df