upgini 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. See the package's registry page for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +95 -4
- upgini/autofe/date.py +26 -6
- upgini/autofe/feature.py +25 -11
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +386 -33
- upgini/features_enricher.py +145 -295
- upgini/metadata.py +1 -16
- upgini/normalizer/phone_normalizer.py +340 -0
- upgini/utils/country_utils.py +0 -16
- upgini/utils/datetime_utils.py +16 -38
- upgini/utils/email_utils.py +17 -49
- upgini/utils/ip_utils.py +1 -100
- upgini/utils/phone_utils.py +0 -345
- upgini/utils/postal_code_utils.py +0 -34
- {upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/METADATA +3 -1
- {upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/RECORD +20 -20
- {upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/WHEEL +1 -1
- upgini/normalizer/normalize_utils.py +0 -203
- {upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Dict, List, Optional, Set
|
|
4
|
+
from typing import Dict, List, Optional, Set
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -113,21 +113,6 @@ class SearchKey(Enum):
|
|
|
113
113
|
if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
|
|
114
114
|
return SearchKey.MSISDN_RANGE_TO
|
|
115
115
|
|
|
116
|
-
@staticmethod
|
|
117
|
-
def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
|
|
118
|
-
if isinstance(keys, SearchKey):
|
|
119
|
-
keys = [keys]
|
|
120
|
-
for col, key_type in search_keys.items():
|
|
121
|
-
if key_type in keys:
|
|
122
|
-
return col
|
|
123
|
-
return None
|
|
124
|
-
|
|
125
|
-
@staticmethod
|
|
126
|
-
def find_all_keys(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> List[SearchKey]:
|
|
127
|
-
if isinstance(keys, SearchKey):
|
|
128
|
-
keys = [keys]
|
|
129
|
-
return [col for col, key_type in search_keys.items() if key_type in keys]
|
|
130
|
-
|
|
131
116
|
|
|
132
117
|
class DataType(Enum):
|
|
133
118
|
INT = "INT"
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
|
|
5
|
+
|
|
6
|
+
from upgini.errors import ValidationError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PhoneNormalizer:
|
|
10
|
+
def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
|
|
11
|
+
self.df = df
|
|
12
|
+
self.phone_column_name = phone_column_name
|
|
13
|
+
self.country_column_name = country_column_name
|
|
14
|
+
|
|
15
|
+
def normalize(self) -> pd.DataFrame:
|
|
16
|
+
self.phone_to_int()
|
|
17
|
+
if self.country_column_name is not None:
|
|
18
|
+
self.df = self.df.apply(self.add_prefix, axis=1)
|
|
19
|
+
return self.df[self.phone_column_name].astype("Int64")
|
|
20
|
+
|
|
21
|
+
def add_prefix(self, row):
|
|
22
|
+
phone = row[self.phone_column_name]
|
|
23
|
+
if pd.isna(phone):
|
|
24
|
+
return row
|
|
25
|
+
country = row[self.country_column_name]
|
|
26
|
+
country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
|
|
27
|
+
if country_prefix_tuple is not None:
|
|
28
|
+
country_prefix, number_of_digits = country_prefix_tuple
|
|
29
|
+
if len(str(phone)) == number_of_digits:
|
|
30
|
+
row[self.phone_column_name] = int(country_prefix + str(phone))
|
|
31
|
+
return row
|
|
32
|
+
|
|
33
|
+
def phone_to_int(self):
|
|
34
|
+
"""
|
|
35
|
+
Convention: phone number is always presented as int number.
|
|
36
|
+
phone_number = Country code + National Destination Code + Subscriber Number.
|
|
37
|
+
Examples:
|
|
38
|
+
41793834315 for Switzerland
|
|
39
|
+
46767040672 for Sweden
|
|
40
|
+
861065529988 for China
|
|
41
|
+
18143008198 for the USA
|
|
42
|
+
Inplace conversion of phone to int.
|
|
43
|
+
|
|
44
|
+
Method will remove all non numeric chars from string and convert it to int.
|
|
45
|
+
None will be set for phone numbers that couldn't be converted to int.
|
|
46
|
+
"""
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
|
|
48
|
+
convert_func = self.phone_str_to_int_safe
|
|
49
|
+
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
|
+
convert_func = self.phone_float_to_int_safe
|
|
51
|
+
elif is_int64_dtype(self.df[self.phone_column_name]):
|
|
52
|
+
convert_func = self.phone_int_to_int_safe
|
|
53
|
+
else:
|
|
54
|
+
raise ValidationError(
|
|
55
|
+
f"phone_column_name {self.phone_column_name} doesn't have supported dtype. "
|
|
56
|
+
f"Dataset dtypes: {self.df.dtypes}. "
|
|
57
|
+
f"Contact developer and request to implement conversion of {self.phone_column_name} to int"
|
|
58
|
+
)
|
|
59
|
+
self.df[self.phone_column_name] = self.df[self.phone_column_name].apply(convert_func).astype("Int64")
|
|
60
|
+
|
|
61
|
+
@staticmethod
|
|
62
|
+
def phone_float_to_int_safe(value: float) -> Optional[int]:
|
|
63
|
+
try:
|
|
64
|
+
return PhoneNormalizer.validate_length(int(value))
|
|
65
|
+
except Exception:
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
@staticmethod
|
|
69
|
+
def phone_int_to_int_safe(value: int) -> Optional[int]:
|
|
70
|
+
try:
|
|
71
|
+
return PhoneNormalizer.validate_length(int(value))
|
|
72
|
+
except Exception:
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
@staticmethod
|
|
76
|
+
def phone_str_to_int_safe(value: str) -> Optional[int]:
|
|
77
|
+
try:
|
|
78
|
+
value = str(value)
|
|
79
|
+
if value.endswith(".0"):
|
|
80
|
+
value = value[: len(value) - 2]
|
|
81
|
+
numeric_filter = filter(str.isdigit, value)
|
|
82
|
+
numeric_string = "".join(numeric_filter)
|
|
83
|
+
return PhoneNormalizer.validate_length(int(numeric_string))
|
|
84
|
+
except Exception:
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
@staticmethod
|
|
88
|
+
def validate_length(value: int) -> Optional[int]:
|
|
89
|
+
if value < 10000000 or value > 999999999999999:
|
|
90
|
+
return None
|
|
91
|
+
else:
|
|
92
|
+
return value
|
|
93
|
+
|
|
94
|
+
COUNTRIES_PREFIXES = {
|
|
95
|
+
"US": ("1", 10),
|
|
96
|
+
"CA": ("1", 10),
|
|
97
|
+
"AI": ("1", 10),
|
|
98
|
+
"AG": ("1", 10),
|
|
99
|
+
"AS": ("1", 10),
|
|
100
|
+
"BB": ("1", 10),
|
|
101
|
+
"BS": ("1", 10),
|
|
102
|
+
"VG": ("1", 10),
|
|
103
|
+
"VI": ("1", 10),
|
|
104
|
+
"KY": ("1", 10),
|
|
105
|
+
"BM": ("1", 10),
|
|
106
|
+
"GD": ("1", 10),
|
|
107
|
+
"TC": ("1", 10),
|
|
108
|
+
"MS": ("1", 10),
|
|
109
|
+
"MP": ("1", 10),
|
|
110
|
+
"GU": ("1", 10),
|
|
111
|
+
"SX": ("1", 10),
|
|
112
|
+
"LC": ("1", 10),
|
|
113
|
+
"DM": ("1", 10),
|
|
114
|
+
"VC": ("1", 10),
|
|
115
|
+
"PR": ("1", 10),
|
|
116
|
+
"TT": ("1", 10),
|
|
117
|
+
"KN": ("1", 10),
|
|
118
|
+
"JM": ("1", 10),
|
|
119
|
+
"EG": ("20", 9),
|
|
120
|
+
"SS": ("211", 9),
|
|
121
|
+
"MA": ("212", 9),
|
|
122
|
+
"EH": ("212", 4),
|
|
123
|
+
"DZ": ("213", 8),
|
|
124
|
+
"TN": ("216", 8),
|
|
125
|
+
"LY": ("218", 9),
|
|
126
|
+
"GM": ("220", 6),
|
|
127
|
+
"SN": ("221", 9),
|
|
128
|
+
"MR": ("222", 7),
|
|
129
|
+
"ML": ("223", 8),
|
|
130
|
+
"GN": ("224", 9),
|
|
131
|
+
"CI": ("225", 7),
|
|
132
|
+
"BF": ("226", 8),
|
|
133
|
+
"NE": ("227", 8),
|
|
134
|
+
"TG": ("228", 8),
|
|
135
|
+
"BJ": ("229", 8),
|
|
136
|
+
"MU": ("230", 7),
|
|
137
|
+
"LR": ("231", 9),
|
|
138
|
+
"SL": ("232", 8),
|
|
139
|
+
"GH": ("233", 9),
|
|
140
|
+
"NG": ("234", 9),
|
|
141
|
+
"TD": ("235", 8),
|
|
142
|
+
"CF": ("236", 7),
|
|
143
|
+
"CM": ("237", 9),
|
|
144
|
+
"CV": ("238", 7),
|
|
145
|
+
"ST": ("239", 7),
|
|
146
|
+
"GQ": ("240", 9),
|
|
147
|
+
"GA": ("241", 8),
|
|
148
|
+
"CG": ("242", 7),
|
|
149
|
+
"CD": ("243", 9),
|
|
150
|
+
"AO": ("244", 9),
|
|
151
|
+
"GW": ("245", 6),
|
|
152
|
+
"IO": ("246", 7),
|
|
153
|
+
"AC": ("247", 5),
|
|
154
|
+
"SC": ("248", 7),
|
|
155
|
+
"SD": ("249", 9),
|
|
156
|
+
"RW": ("250", 9),
|
|
157
|
+
"ET": ("251", 9),
|
|
158
|
+
"SO": ("252", 9),
|
|
159
|
+
"DJ": ("253", 8),
|
|
160
|
+
"KE": ("254", 9),
|
|
161
|
+
"TZ": ("255", 9),
|
|
162
|
+
"UG": ("256", 9),
|
|
163
|
+
"BI": ("257", 8),
|
|
164
|
+
"MZ": ("258", 8),
|
|
165
|
+
"ZM": ("260", 9),
|
|
166
|
+
"MG": ("261", 9),
|
|
167
|
+
"RE": ("262", 9),
|
|
168
|
+
"YT": ("262", 9),
|
|
169
|
+
"TF": ("262", 9),
|
|
170
|
+
"ZW": ("263", 9),
|
|
171
|
+
"NA": ("264", 9),
|
|
172
|
+
"MW": ("265", 7),
|
|
173
|
+
"LS": ("266", 8),
|
|
174
|
+
"BW": ("267", 7),
|
|
175
|
+
"SZ": ("268", 8),
|
|
176
|
+
"KM": ("269", 7),
|
|
177
|
+
"ZA": ("27", 10),
|
|
178
|
+
"SH": ("290", 5),
|
|
179
|
+
"TA": ("290", 5),
|
|
180
|
+
"ER": ("291", 7),
|
|
181
|
+
"AT": ("43", 10),
|
|
182
|
+
"AW": ("297", 7),
|
|
183
|
+
"FO": ("298", 6),
|
|
184
|
+
"GL": ("299", 6),
|
|
185
|
+
"GR": ("30", 10),
|
|
186
|
+
"BE": ("32", 8),
|
|
187
|
+
"FR": ("33", 9),
|
|
188
|
+
"ES": ("34", 9),
|
|
189
|
+
"GI": ("350", 8),
|
|
190
|
+
"PE": ("51", 8),
|
|
191
|
+
"MX": ("52", 10),
|
|
192
|
+
"CU": ("53", 8),
|
|
193
|
+
"AR": ("54", 10),
|
|
194
|
+
"BR": ("55", 10),
|
|
195
|
+
"CL": ("56", 9),
|
|
196
|
+
"CO": ("57", 8),
|
|
197
|
+
"VE": ("58", 10),
|
|
198
|
+
"PT": ("351", 9),
|
|
199
|
+
"LU": ("352", 8),
|
|
200
|
+
"IE": ("353", 8),
|
|
201
|
+
"IS": ("354", 7),
|
|
202
|
+
"AL": ("355", 8),
|
|
203
|
+
"MT": ("356", 8),
|
|
204
|
+
"CY": ("357", 8),
|
|
205
|
+
"FI": ("358", 9),
|
|
206
|
+
"BG": ("359", 8),
|
|
207
|
+
"HU": ("36", 8),
|
|
208
|
+
"LT": ("370", 8),
|
|
209
|
+
"LV": ("371", 8),
|
|
210
|
+
"EE": ("372", 7),
|
|
211
|
+
"MD": ("373", 8),
|
|
212
|
+
"AM": ("374", 8),
|
|
213
|
+
"BY": ("375", 9),
|
|
214
|
+
"AD": ("376", 6),
|
|
215
|
+
"MC": ("377", 8),
|
|
216
|
+
"SM": ("378", 9),
|
|
217
|
+
"VA": ("3906698", 5),
|
|
218
|
+
"UA": ("380", 9),
|
|
219
|
+
"RS": ("381", 9),
|
|
220
|
+
"ME": ("382", 8),
|
|
221
|
+
"HR": ("385", 8),
|
|
222
|
+
"SI": ("386", 8),
|
|
223
|
+
"BA": ("387", 8),
|
|
224
|
+
"MK": ("389", 8),
|
|
225
|
+
"MY": ("60", 9),
|
|
226
|
+
"AU": ("61", 9),
|
|
227
|
+
"CX": ("61", 9),
|
|
228
|
+
"CC": ("61", 9),
|
|
229
|
+
"ID": ("62", 9),
|
|
230
|
+
"PH": ("632", 7),
|
|
231
|
+
"NZ": ("64", 8),
|
|
232
|
+
"PN": ("64", 8),
|
|
233
|
+
"SG": ("65", 8),
|
|
234
|
+
"TH": ("66", 8),
|
|
235
|
+
"IT": ("39", 10),
|
|
236
|
+
"RO": ("40", 9),
|
|
237
|
+
"CH": ("41", 9),
|
|
238
|
+
"CZ": ("420", 9),
|
|
239
|
+
"SK": ("421", 9),
|
|
240
|
+
"GB": ("44", 10),
|
|
241
|
+
"LI": ("423", 7),
|
|
242
|
+
"GG": ("44", 10),
|
|
243
|
+
"IM": ("44", 10),
|
|
244
|
+
"JE": ("44", 10),
|
|
245
|
+
"DK": ("45", 8),
|
|
246
|
+
"SE": ("46", 8),
|
|
247
|
+
"BD": ("880", 8),
|
|
248
|
+
"TW": ("886", 9),
|
|
249
|
+
"JP": ("81", 9),
|
|
250
|
+
"KR": ("82", 9),
|
|
251
|
+
"VN": ("84", 10),
|
|
252
|
+
"KP": ("850", 8),
|
|
253
|
+
"HK": ("852", 8),
|
|
254
|
+
"MO": ("853", 8),
|
|
255
|
+
"KH": ("855", 8),
|
|
256
|
+
"LA": ("856", 8),
|
|
257
|
+
"NO": ("47", 8),
|
|
258
|
+
"SJ": ("47", 8),
|
|
259
|
+
"BV": ("47", 8),
|
|
260
|
+
"PL": ("48", 9),
|
|
261
|
+
"DE": ("49", 10),
|
|
262
|
+
"TR": ("90", 10),
|
|
263
|
+
"IN": ("91", 10),
|
|
264
|
+
"PK": ("92", 9),
|
|
265
|
+
"AF": ("93", 9),
|
|
266
|
+
"LK": ("94", 9),
|
|
267
|
+
"MM": ("95", 7),
|
|
268
|
+
"IR": ("98", 10),
|
|
269
|
+
"MV": ("960", 7),
|
|
270
|
+
"LB": ("961", 7),
|
|
271
|
+
"JO": ("962", 9),
|
|
272
|
+
"SY": ("963", 10),
|
|
273
|
+
"IQ": ("964", 10),
|
|
274
|
+
"KW": ("965", 7),
|
|
275
|
+
"SA": ("966", 9),
|
|
276
|
+
"YE": ("967", 7),
|
|
277
|
+
"OM": ("968", 8),
|
|
278
|
+
"PS": ("970", 8),
|
|
279
|
+
"AE": ("971", 8),
|
|
280
|
+
"IL": ("972", 9),
|
|
281
|
+
"BH": ("973", 8),
|
|
282
|
+
"QA": ("974", 8),
|
|
283
|
+
"BT": ("975", 7),
|
|
284
|
+
"MN": ("976", 8),
|
|
285
|
+
"NP": ("977", 8),
|
|
286
|
+
"TJ": ("992", 9),
|
|
287
|
+
"TM": ("993", 8),
|
|
288
|
+
"AZ": ("994", 9),
|
|
289
|
+
"GE": ("995", 9),
|
|
290
|
+
"KG": ("996", 9),
|
|
291
|
+
"UZ": ("998", 9),
|
|
292
|
+
"FK": ("500", 5),
|
|
293
|
+
"BZ": ("501", 7),
|
|
294
|
+
"GT": ("502", 8),
|
|
295
|
+
"SV": ("503", 8),
|
|
296
|
+
"HN": ("504", 8),
|
|
297
|
+
"NI": ("505", 8),
|
|
298
|
+
"CR": ("506", 8),
|
|
299
|
+
"PA": ("507", 7),
|
|
300
|
+
"PM": ("508", 6),
|
|
301
|
+
"HT": ("509", 8),
|
|
302
|
+
"GS": ("500", 5),
|
|
303
|
+
"MF": ("590", 9),
|
|
304
|
+
"BL": ("590", 9),
|
|
305
|
+
"GP": ("590", 9),
|
|
306
|
+
"BO": ("591", 9),
|
|
307
|
+
"GY": ("592", 9),
|
|
308
|
+
"EC": ("593", 9),
|
|
309
|
+
"GF": ("594", 9),
|
|
310
|
+
"PY": ("595", 9),
|
|
311
|
+
"MQ": ("596", 9),
|
|
312
|
+
"SR": ("597", 9),
|
|
313
|
+
"UY": ("598", 9),
|
|
314
|
+
"CW": ("599", 9),
|
|
315
|
+
"BQ": ("599", 9),
|
|
316
|
+
"RU": ("7", 10),
|
|
317
|
+
"KZ": ("7", 10),
|
|
318
|
+
"TL": ("670", 7),
|
|
319
|
+
"NF": ("672", 7),
|
|
320
|
+
"HM": ("672", 7),
|
|
321
|
+
"BN": ("673", 7),
|
|
322
|
+
"NR": ("674", 7),
|
|
323
|
+
"PG": ("675", 7),
|
|
324
|
+
"TO": ("676", 7),
|
|
325
|
+
"SB": ("677", 7),
|
|
326
|
+
"VU": ("678", 7),
|
|
327
|
+
"FJ": ("679", 7),
|
|
328
|
+
"PW": ("680", 7),
|
|
329
|
+
"WF": ("681", 7),
|
|
330
|
+
"CK": ("682", 5),
|
|
331
|
+
"NU": ("683", 7),
|
|
332
|
+
"WS": ("685", 7),
|
|
333
|
+
"KI": ("686", 7),
|
|
334
|
+
"NC": ("687", 7),
|
|
335
|
+
"TV": ("688", 7),
|
|
336
|
+
"PF": ("689", 7),
|
|
337
|
+
"TK": ("690", 7),
|
|
338
|
+
"FM": ("691", 7),
|
|
339
|
+
"MH": ("692", 7),
|
|
340
|
+
}
|
upgini/utils/country_utils.py
CHANGED
|
@@ -4,22 +4,6 @@ from pandas.api.types import is_object_dtype, is_string_dtype
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class CountrySearchKeyConverter:
|
|
8
|
-
|
|
9
|
-
def __init__(self, country_col: str):
|
|
10
|
-
self.country_col = country_col
|
|
11
|
-
|
|
12
|
-
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
13
|
-
df[self.country_col] = (
|
|
14
|
-
df[self.country_col]
|
|
15
|
-
.astype("string")
|
|
16
|
-
.str.upper()
|
|
17
|
-
.str.replace(r"[^A-Z]", "", regex=True)
|
|
18
|
-
.str.replace("UK", "GB", regex=False)
|
|
19
|
-
)
|
|
20
|
-
return df
|
|
21
|
-
|
|
22
|
-
|
|
23
7
|
class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
24
8
|
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
25
9
|
return "country" in str(column_name).lower()
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
import pytz
|
|
5
4
|
from typing import Dict, List, Optional
|
|
6
5
|
|
|
7
6
|
import numpy as np
|
|
8
7
|
import pandas as pd
|
|
9
8
|
from dateutil.relativedelta import relativedelta
|
|
10
|
-
from pandas.api.types import
|
|
9
|
+
from pandas.api.types import (
|
|
10
|
+
is_numeric_dtype,
|
|
11
|
+
is_period_dtype,
|
|
12
|
+
)
|
|
11
13
|
|
|
12
14
|
from upgini.errors import ValidationError
|
|
13
|
-
from upgini.metadata import
|
|
15
|
+
from upgini.metadata import SearchKey
|
|
14
16
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
15
17
|
from upgini.utils.warning_counter import WarningCounter
|
|
16
18
|
|
|
@@ -29,22 +31,18 @@ DATE_FORMATS = [
|
|
|
29
31
|
"%Y-%m-%dT%H:%M:%S.%f",
|
|
30
32
|
]
|
|
31
33
|
|
|
32
|
-
DATETIME_PATTERN = r"^[\d\s\.\-:T
|
|
34
|
+
DATETIME_PATTERN = r"^[\d\s\.\-:T/]+$"
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
class DateTimeSearchKeyConverter:
|
|
36
38
|
DATETIME_COL = "_date_time"
|
|
37
|
-
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
38
|
-
MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
|
|
39
39
|
|
|
40
40
|
def __init__(
|
|
41
41
|
self,
|
|
42
42
|
date_column: str,
|
|
43
43
|
date_format: Optional[str] = None,
|
|
44
44
|
logger: Optional[logging.Logger] = None,
|
|
45
|
-
bundle:
|
|
46
|
-
warnings_counter: Optional[WarningCounter] = None,
|
|
47
|
-
silent_mode=False,
|
|
45
|
+
bundle: ResourceBundle = None,
|
|
48
46
|
):
|
|
49
47
|
self.date_column = date_column
|
|
50
48
|
self.date_format = date_format
|
|
@@ -55,8 +53,6 @@ class DateTimeSearchKeyConverter:
|
|
|
55
53
|
self.logger.setLevel("FATAL")
|
|
56
54
|
self.generated_features: List[str] = []
|
|
57
55
|
self.bundle = bundle or get_custom_bundle()
|
|
58
|
-
self.warnings_counter = warnings_counter or WarningCounter()
|
|
59
|
-
self.silent_mode = silent_mode
|
|
60
56
|
|
|
61
57
|
@staticmethod
|
|
62
58
|
def _int_to_opt(i: int) -> Optional[int]:
|
|
@@ -92,13 +88,13 @@ class DateTimeSearchKeyConverter:
|
|
|
92
88
|
# 315532801000 - 2524608001000 - milliseconds
|
|
93
89
|
# 315532801000000 - 2524608001000000 - microseconds
|
|
94
90
|
# 315532801000000000 - 2524608001000000000 - nanoseconds
|
|
95
|
-
if df[self.date_column].apply(lambda x: 10**16 < x).all():
|
|
91
|
+
if df[self.date_column].apply(lambda x: 10 ** 16 < x).all():
|
|
96
92
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
|
|
97
|
-
elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
|
|
93
|
+
elif df[self.date_column].apply(lambda x: 10 ** 14 < x < 10 ** 16).all():
|
|
98
94
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
|
|
99
|
-
elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
|
|
95
|
+
elif df[self.date_column].apply(lambda x: 10 ** 11 < x < 10 ** 14).all():
|
|
100
96
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
|
|
101
|
-
elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
|
|
97
|
+
elif df[self.date_column].apply(lambda x: 0 < x < 10 ** 11).all():
|
|
102
98
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
|
|
103
99
|
else:
|
|
104
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
@@ -112,9 +108,6 @@ class DateTimeSearchKeyConverter:
|
|
|
112
108
|
# as additional features
|
|
113
109
|
seconds = "datetime_seconds"
|
|
114
110
|
df[self.date_column] = df[self.date_column].dt.tz_localize(None)
|
|
115
|
-
|
|
116
|
-
df = self.clean_old_dates(df)
|
|
117
|
-
|
|
118
111
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
119
112
|
|
|
120
113
|
seconds_without_na = df[seconds].dropna()
|
|
@@ -159,19 +152,6 @@ class DateTimeSearchKeyConverter:
|
|
|
159
152
|
except ValueError:
|
|
160
153
|
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
161
154
|
|
|
162
|
-
def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
163
|
-
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
164
|
-
old_subset = df[condition]
|
|
165
|
-
if len(old_subset) > 0:
|
|
166
|
-
df.loc[condition, self.date_column] = None
|
|
167
|
-
self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
|
|
168
|
-
msg = self.bundle.get("dataset_drop_old_dates")
|
|
169
|
-
self.logger.warning(msg)
|
|
170
|
-
if not self.silent_mode:
|
|
171
|
-
print(msg)
|
|
172
|
-
self.warnings_counter.increment()
|
|
173
|
-
return df
|
|
174
|
-
|
|
175
155
|
|
|
176
156
|
def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
177
157
|
try:
|
|
@@ -258,18 +238,16 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
258
238
|
|
|
259
239
|
|
|
260
240
|
def validate_dates_distribution(
|
|
261
|
-
|
|
241
|
+
X: pd.DataFrame,
|
|
262
242
|
search_keys: Dict[str, SearchKey],
|
|
263
243
|
logger: Optional[logging.Logger] = None,
|
|
264
244
|
bundle: Optional[ResourceBundle] = None,
|
|
265
245
|
warning_counter: Optional[WarningCounter] = None,
|
|
266
246
|
):
|
|
267
|
-
maybe_date_col =
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
else:
|
|
272
|
-
X = df
|
|
247
|
+
maybe_date_col = None
|
|
248
|
+
for key, key_type in search_keys.items():
|
|
249
|
+
if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
250
|
+
maybe_date_col = key
|
|
273
251
|
|
|
274
252
|
if maybe_date_col is None:
|
|
275
253
|
for col in X.columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
8
8
|
|
|
9
9
|
from upgini.metadata import SearchKey
|
|
10
|
-
from upgini.resource_bundle import
|
|
10
|
+
from upgini.resource_bundle import bundle
|
|
11
11
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
12
12
|
|
|
13
13
|
EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
|
|
@@ -28,53 +28,29 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
28
28
|
return is_email_count / all_count > 0.1
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
class EmailDomainGenerator:
|
|
32
|
-
DOMAIN_SUFFIX = "_domain"
|
|
33
|
-
|
|
34
|
-
def __init__(self, email_columns: List[str]):
|
|
35
|
-
self.email_columns = email_columns
|
|
36
|
-
self.generated_features = []
|
|
37
|
-
|
|
38
|
-
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
|
-
for email_col in self.email_columns:
|
|
40
|
-
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
df[domain_feature] = df[email_col].apply(self._email_to_domain)
|
|
42
|
-
self.generated_features.append(domain_feature)
|
|
43
|
-
return df
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
def _email_to_domain(email: str) -> Optional[str]:
|
|
47
|
-
if email is not None and isinstance(email, str) and "@" in email:
|
|
48
|
-
name_and_domain = email.split("@")
|
|
49
|
-
if len(name_and_domain) == 2 and len(name_and_domain[1]) > 0:
|
|
50
|
-
return name_and_domain[1]
|
|
51
|
-
|
|
52
|
-
|
|
53
31
|
class EmailSearchKeyConverter:
|
|
54
|
-
|
|
55
|
-
|
|
32
|
+
HEM_COLUMN_NAME = "hashed_email"
|
|
33
|
+
DOMAIN_COLUMN_NAME = "email_domain"
|
|
34
|
+
EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
|
|
56
35
|
|
|
57
36
|
def __init__(
|
|
58
37
|
self,
|
|
59
38
|
email_column: str,
|
|
60
39
|
hem_column: Optional[str],
|
|
61
40
|
search_keys: Dict[str, SearchKey],
|
|
62
|
-
columns_renaming: Dict[str, str],
|
|
63
41
|
unnest_search_keys: Optional[List[str]] = None,
|
|
64
|
-
bundle: Optional[ResourceBundle] = None,
|
|
65
42
|
logger: Optional[logging.Logger] = None,
|
|
66
43
|
):
|
|
67
44
|
self.email_column = email_column
|
|
68
45
|
self.hem_column = hem_column
|
|
69
46
|
self.search_keys = search_keys
|
|
70
|
-
self.columns_renaming = columns_renaming
|
|
71
47
|
self.unnest_search_keys = unnest_search_keys
|
|
72
|
-
self.bundle = bundle or get_custom_bundle()
|
|
73
48
|
if logger is not None:
|
|
74
49
|
self.logger = logger
|
|
75
50
|
else:
|
|
76
51
|
self.logger = logging.getLogger()
|
|
77
52
|
self.logger.setLevel("FATAL")
|
|
53
|
+
self.generated_features: List[str] = []
|
|
78
54
|
self.email_converted_to_hem = False
|
|
79
55
|
|
|
80
56
|
@staticmethod
|
|
@@ -85,7 +61,7 @@ class EmailSearchKeyConverter:
|
|
|
85
61
|
if not EMAIL_REGEX.fullmatch(email):
|
|
86
62
|
return None
|
|
87
63
|
|
|
88
|
-
return sha256(email.lower().encode("utf-8")).hexdigest()
|
|
64
|
+
return sha256(email.lower().encode("utf-8")).hexdigest()
|
|
89
65
|
|
|
90
66
|
@staticmethod
|
|
91
67
|
def _email_to_one_domain(email: str) -> Optional[str]:
|
|
@@ -96,36 +72,28 @@ class EmailSearchKeyConverter:
|
|
|
96
72
|
|
|
97
73
|
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
98
74
|
df = df.copy()
|
|
99
|
-
original_email_column = self.columns_renaming[self.email_column]
|
|
100
75
|
if self.hem_column is None:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
msg = self.bundle.get("all_emails_invalid").format(self.email_column)
|
|
76
|
+
df[self.HEM_COLUMN_NAME] = df[self.email_column].apply(self._email_to_hem)
|
|
77
|
+
if df[self.HEM_COLUMN_NAME].isna().all():
|
|
78
|
+
msg = bundle.get("all_emails_invalid").format(self.email_column)
|
|
105
79
|
print(msg)
|
|
106
80
|
self.logger.warning(msg)
|
|
107
|
-
df = df.drop(columns=
|
|
81
|
+
df = df.drop(columns=self.HEM_COLUMN_NAME)
|
|
108
82
|
del self.search_keys[self.email_column]
|
|
109
83
|
return df
|
|
110
|
-
self.search_keys[
|
|
111
|
-
|
|
112
|
-
self.unnest_search_keys.append(hem_name)
|
|
113
|
-
self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
|
|
84
|
+
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
+
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
114
86
|
self.email_converted_to_hem = True
|
|
115
|
-
else:
|
|
116
|
-
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
117
87
|
|
|
118
88
|
del self.search_keys[self.email_column]
|
|
119
89
|
if self.email_column in self.unnest_search_keys:
|
|
120
90
|
self.unnest_search_keys.remove(self.email_column)
|
|
121
91
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
self.
|
|
125
|
-
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
92
|
+
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
93
|
+
|
|
94
|
+
self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
|
|
126
95
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
del self.columns_renaming[self.email_column]
|
|
96
|
+
df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
|
|
97
|
+
self.generated_features.append(self.DOMAIN_COLUMN_NAME)
|
|
130
98
|
|
|
131
99
|
return df
|