upgini 1.1.312__py3-none-any.whl → 1.1.312a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -1,340 +0,0 @@
1
- from typing import Optional
2
-
3
- import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
-
6
- from upgini.errors import ValidationError
7
-
8
-
9
class PhoneNormalizer:
    """Normalize a phone-number column of a DataFrame to nullable integers.

    Convention: a phone number is always represented as an integer made of
    country code + national destination code + subscriber number, e.g.
    ``41793834315`` (Switzerland) or ``18143008198`` (USA).

    When a country column is supplied, numbers whose digit count equals the
    national length registered in ``COUNTRIES_PREFIXES`` for that country get
    the country calling code prepended.
    """

    def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
        """Store the frame and the names of the columns to work on.

        Args:
            df: Frame holding the phone column. ``phone_to_int`` writes the
                converted column back into this same object.
            phone_column_name: Name of the column with phone numbers.
            country_column_name: Optional name of the column whose values are
                looked up as keys of ``COUNTRIES_PREFIXES``.
        """
        self.df = df
        self.phone_column_name = phone_column_name
        self.country_column_name = country_column_name

    def normalize(self) -> pd.Series:
        """Convert the phone column to integers and add country prefixes.

        Returns:
            The normalized phone column with dtype ``Int64``; values that
            could not be converted are missing.

        Raises:
            ValidationError: If the phone column has an unsupported dtype.
        """
        self.phone_to_int()
        if self.country_column_name is not None:
            # Row-wise pass: the prefix depends on both phone and country.
            self.df = self.df.apply(self.add_prefix, axis=1)
        return self.df[self.phone_column_name].astype("Int64")

    def add_prefix(self, row: pd.Series) -> pd.Series:
        """Prepend the country calling code to a national-length number.

        The row is returned unchanged when the phone is missing, the country
        is not in ``COUNTRIES_PREFIXES``, or the number of digits differs from
        the national length registered for that country.

        Args:
            row: One row of the frame; must contain the phone and country
                columns.

        Returns:
            The same row object, with the phone value possibly replaced.
        """
        phone = row[self.phone_column_name]
        if pd.isna(phone):
            return row
        country_prefix_tuple = self.COUNTRIES_PREFIXES.get(row[self.country_column_name])
        if country_prefix_tuple is None:
            return row
        country_prefix, number_of_digits = country_prefix_tuple
        # An integral float would render as "8143008198.0" and never match
        # the expected digit count, so count digits of its integer value.
        if isinstance(phone, float) and phone.is_integer():
            phone = int(phone)
        digits = str(phone)
        if len(digits) == number_of_digits:
            row[self.phone_column_name] = int(country_prefix + digits)
        return row

    def phone_to_int(self) -> None:
        """
        Convention: phone number is always presented as int number.
        phone_number = Country code + National Destination Code + Subscriber Number.
        Examples:
        41793834315 for Switzerland
        46767040672 for Sweden
        861065529988 for China
        18143008198 for the USA
        Inplace conversion of phone to int.

        Method will remove all non numeric chars from string and convert it to int.
        None will be set for phone numbers that couldn't be converted to int

        Raises:
            ValidationError: If the column is neither string/object, float
                nor integer typed.
        """
        # Imported locally so this block does not depend on the file-level
        # import list; accepts every integer width, not only int64.
        from pandas.api.types import is_integer_dtype

        column = self.df[self.phone_column_name]
        if is_string_dtype(column) or is_object_dtype(column):
            convert_func = self.phone_str_to_int_safe
        elif is_float_dtype(column):
            convert_func = self.phone_float_to_int_safe
        elif is_integer_dtype(column):
            convert_func = self.phone_int_to_int_safe
        else:
            raise ValidationError(
                f"phone_column_name {self.phone_column_name} doesn't have supported dtype. "
                f"Dataset dtypes: {self.df.dtypes}. "
                f"Contact developer and request to implement conversion of {self.phone_column_name} to int"
            )
        self.df[self.phone_column_name] = column.apply(convert_func).astype("Int64")

    @staticmethod
    def _number_to_int_safe(value) -> Optional[int]:
        """Truncate a numeric value to int and range-check it; None on failure."""
        try:
            return PhoneNormalizer.validate_length(int(value))
        except (TypeError, ValueError, OverflowError):
            # int() raises ValueError for NaN, OverflowError for infinity and
            # TypeError for values it cannot interpret as a number.
            return None

    @staticmethod
    def phone_float_to_int_safe(value: float) -> Optional[int]:
        """Convert a float phone value to a validated int, or None."""
        return PhoneNormalizer._number_to_int_safe(value)

    @staticmethod
    def phone_int_to_int_safe(value: int) -> Optional[int]:
        """Convert an integer phone value to a validated int, or None."""
        return PhoneNormalizer._number_to_int_safe(value)

    @staticmethod
    def phone_str_to_int_safe(value: str) -> Optional[int]:
        """Extract the digits of a textual phone value as a validated int.

        A trailing ``".0"`` (a float rendered as text) is dropped first, then
        every character that is not a decimal digit is removed.

        Returns:
            The number, or None when no digits remain or the result is
            outside the range accepted by ``validate_length``.
        """
        try:
            text = str(value)
            if text.endswith(".0"):
                text = text[:-2]
            # isdecimal(), unlike isdigit(), only keeps characters int() can
            # parse; isdigit() also accepts e.g. superscript digits, which
            # made int() fail and the whole number be discarded.
            numeric_string = "".join(ch for ch in text if ch.isdecimal())
            return PhoneNormalizer.validate_length(int(numeric_string))
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def validate_length(value: int) -> Optional[int]:
        """Return ``value`` if it has between 8 and 15 digits, else None."""
        if value < 10000000 or value > 999999999999999:
            return None
        return value

    # Country code -> (calling-code prefix, number of digits that a number
    # must have for add_prefix to prepend the prefix).
    COUNTRIES_PREFIXES = {
        "US": ("1", 10),
        "CA": ("1", 10),
        "AI": ("1", 10),
        "AG": ("1", 10),
        "AS": ("1", 10),
        "BB": ("1", 10),
        "BS": ("1", 10),
        "VG": ("1", 10),
        "VI": ("1", 10),
        "KY": ("1", 10),
        "BM": ("1", 10),
        "GD": ("1", 10),
        "TC": ("1", 10),
        "MS": ("1", 10),
        "MP": ("1", 10),
        "GU": ("1", 10),
        "SX": ("1", 10),
        "LC": ("1", 10),
        "DM": ("1", 10),
        "VC": ("1", 10),
        "PR": ("1", 10),
        "TT": ("1", 10),
        "KN": ("1", 10),
        "JM": ("1", 10),
        "EG": ("20", 9),
        "SS": ("211", 9),
        "MA": ("212", 9),
        "EH": ("212", 4),
        "DZ": ("213", 8),
        "TN": ("216", 8),
        "LY": ("218", 9),
        "GM": ("220", 6),
        "SN": ("221", 9),
        "MR": ("222", 7),
        "ML": ("223", 8),
        "GN": ("224", 9),
        "CI": ("225", 7),
        "BF": ("226", 8),
        "NE": ("227", 8),
        "TG": ("228", 8),
        "BJ": ("229", 8),
        "MU": ("230", 7),
        "LR": ("231", 9),
        "SL": ("232", 8),
        "GH": ("233", 9),
        "NG": ("234", 9),
        "TD": ("235", 8),
        "CF": ("236", 7),
        "CM": ("237", 9),
        "CV": ("238", 7),
        "ST": ("239", 7),
        "GQ": ("240", 9),
        "GA": ("241", 8),
        "CG": ("242", 7),
        "CD": ("243", 9),
        "AO": ("244", 9),
        "GW": ("245", 6),
        "IO": ("246", 7),
        "AC": ("247", 5),
        "SC": ("248", 7),
        "SD": ("249", 9),
        "RW": ("250", 9),
        "ET": ("251", 9),
        "SO": ("252", 9),
        "DJ": ("253", 8),
        "KE": ("254", 9),
        "TZ": ("255", 9),
        "UG": ("256", 9),
        "BI": ("257", 8),
        "MZ": ("258", 8),
        "ZM": ("260", 9),
        "MG": ("261", 9),
        "RE": ("262", 9),
        "YT": ("262", 9),
        "TF": ("262", 9),
        "ZW": ("263", 9),
        "NA": ("264", 9),
        "MW": ("265", 7),
        "LS": ("266", 8),
        "BW": ("267", 7),
        "SZ": ("268", 8),
        "KM": ("269", 7),
        "ZA": ("27", 10),
        "SH": ("290", 5),
        "TA": ("290", 5),
        "ER": ("291", 7),
        "AT": ("43", 10),
        "AW": ("297", 7),
        "FO": ("298", 6),
        "GL": ("299", 6),
        "GR": ("30", 10),
        "BE": ("32", 8),
        "FR": ("33", 9),
        "ES": ("34", 9),
        "GI": ("350", 8),
        "PE": ("51", 8),
        "MX": ("52", 10),
        "CU": ("53", 8),
        "AR": ("54", 10),
        "BR": ("55", 10),
        "CL": ("56", 9),
        "CO": ("57", 8),
        "VE": ("58", 10),
        "PT": ("351", 9),
        "LU": ("352", 8),
        "IE": ("353", 8),
        "IS": ("354", 7),
        "AL": ("355", 8),
        "MT": ("356", 8),
        "CY": ("357", 8),
        "FI": ("358", 9),
        "BG": ("359", 8),
        "HU": ("36", 8),
        "LT": ("370", 8),
        "LV": ("371", 8),
        "EE": ("372", 7),
        "MD": ("373", 8),
        "AM": ("374", 8),
        "BY": ("375", 9),
        "AD": ("376", 6),
        "MC": ("377", 8),
        "SM": ("378", 9),
        "VA": ("3906698", 5),
        "UA": ("380", 9),
        "RS": ("381", 9),
        "ME": ("382", 8),
        "HR": ("385", 8),
        "SI": ("386", 8),
        "BA": ("387", 8),
        "MK": ("389", 8),
        "MY": ("60", 9),
        "AU": ("61", 9),
        "CX": ("61", 9),
        "CC": ("61", 9),
        "ID": ("62", 9),
        "PH": ("632", 7),
        "NZ": ("64", 8),
        "PN": ("64", 8),
        "SG": ("65", 8),
        "TH": ("66", 8),
        "IT": ("39", 10),
        "RO": ("40", 9),
        "CH": ("41", 9),
        "CZ": ("420", 9),
        "SK": ("421", 9),
        "GB": ("44", 10),
        "LI": ("423", 7),
        "GG": ("44", 10),
        "IM": ("44", 10),
        "JE": ("44", 10),
        "DK": ("45", 8),
        "SE": ("46", 8),
        "BD": ("880", 8),
        "TW": ("886", 9),
        "JP": ("81", 9),
        "KR": ("82", 9),
        "VN": ("84", 10),
        "KP": ("850", 8),
        "HK": ("852", 8),
        "MO": ("853", 8),
        "KH": ("855", 8),
        "LA": ("856", 8),
        "NO": ("47", 8),
        "SJ": ("47", 8),
        "BV": ("47", 8),
        "PL": ("48", 9),
        "DE": ("49", 10),
        "TR": ("90", 10),
        "IN": ("91", 10),
        "PK": ("92", 9),
        "AF": ("93", 9),
        "LK": ("94", 9),
        "MM": ("95", 7),
        "IR": ("98", 10),
        "MV": ("960", 7),
        "LB": ("961", 7),
        "JO": ("962", 9),
        "SY": ("963", 10),
        "IQ": ("964", 10),
        "KW": ("965", 7),
        "SA": ("966", 9),
        "YE": ("967", 7),
        "OM": ("968", 8),
        "PS": ("970", 8),
        "AE": ("971", 8),
        "IL": ("972", 9),
        "BH": ("973", 8),
        "QA": ("974", 8),
        "BT": ("975", 7),
        "MN": ("976", 8),
        "NP": ("977", 8),
        "TJ": ("992", 9),
        "TM": ("993", 8),
        "AZ": ("994", 9),
        "GE": ("995", 9),
        "KG": ("996", 9),
        "UZ": ("998", 9),
        "FK": ("500", 5),
        "BZ": ("501", 7),
        "GT": ("502", 8),
        "SV": ("503", 8),
        "HN": ("504", 8),
        "NI": ("505", 8),
        "CR": ("506", 8),
        "PA": ("507", 7),
        "PM": ("508", 6),
        "HT": ("509", 8),
        "GS": ("500", 5),
        "MF": ("590", 9),
        "BL": ("590", 9),
        "GP": ("590", 9),
        "BO": ("591", 9),
        "GY": ("592", 9),
        "EC": ("593", 9),
        "GF": ("594", 9),
        "PY": ("595", 9),
        "MQ": ("596", 9),
        "SR": ("597", 9),
        "UY": ("598", 9),
        "CW": ("599", 9),
        "BQ": ("599", 9),
        "RU": ("7", 10),
        "KZ": ("7", 10),
        "TL": ("670", 7),
        "NF": ("672", 7),
        "HM": ("672", 7),
        "BN": ("673", 7),
        "NR": ("674", 7),
        "PG": ("675", 7),
        "TO": ("676", 7),
        "SB": ("677", 7),
        "VU": ("678", 7),
        "FJ": ("679", 7),
        "PW": ("680", 7),
        "WF": ("681", 7),
        "CK": ("682", 5),
        "NU": ("683", 7),
        "WS": ("685", 7),
        "KI": ("686", 7),
        "NC": ("687", 7),
        "TV": ("688", 7),
        "PF": ("689", 7),
        "TK": ("690", 7),
        "FM": ("691", 7),
        "MH": ("692", 7),
    }