upgini 1.1.309a1__py3-none-any.whl → 1.1.309a3511.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +33 -7
- upgini/autofe/binary.py +93 -2
- upgini/autofe/date.py +16 -3
- upgini/autofe/feature.py +24 -11
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +385 -30
- upgini/features_enricher.py +120 -276
- upgini/metadata.py +1 -16
- upgini/normalizer/phone_normalizer.py +340 -0
- upgini/utils/country_utils.py +0 -16
- upgini/utils/datetime_utils.py +15 -34
- upgini/utils/email_utils.py +5 -19
- upgini/utils/ip_utils.py +1 -100
- upgini/utils/phone_utils.py +0 -345
- upgini/utils/postal_code_utils.py +0 -34
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev1.dist-info}/METADATA +3 -1
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev1.dist-info}/RECORD +20 -20
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev1.dist-info}/WHEEL +1 -1
- upgini/normalizer/normalize_utils.py +0 -203
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/utils/ip_utils.py
CHANGED
|
@@ -1,114 +1,15 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
|
|
3
|
-
from typing import Dict, List, Optional, Union
|
|
2
|
+
from typing import Dict, List, Optional
|
|
4
3
|
|
|
5
4
|
import pandas as pd
|
|
6
5
|
from requests import get
|
|
7
6
|
|
|
8
|
-
from upgini.errors import ValidationError
|
|
9
7
|
from upgini.metadata import SearchKey
|
|
10
|
-
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
11
8
|
|
|
12
9
|
# from upgini.resource_bundle import bundle
|
|
13
10
|
# from upgini.utils.track_info import get_track_metrics
|
|
14
11
|
|
|
15
12
|
|
|
16
|
-
class IpSearchKeyConverter:
|
|
17
|
-
def __init__(
|
|
18
|
-
self,
|
|
19
|
-
ip_column: str,
|
|
20
|
-
search_keys: Dict[str, SearchKey],
|
|
21
|
-
columns_renaming: Dict[str, str],
|
|
22
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
23
|
-
bundle: Optional[ResourceBundle] = None,
|
|
24
|
-
logger: Optional[logging.Logger] = None,
|
|
25
|
-
):
|
|
26
|
-
self.ip_column = ip_column
|
|
27
|
-
self.search_keys = search_keys
|
|
28
|
-
self.columns_renaming = columns_renaming
|
|
29
|
-
self.unnest_search_keys = unnest_search_keys
|
|
30
|
-
self.bundle = bundle or get_custom_bundle()
|
|
31
|
-
if logger is not None:
|
|
32
|
-
self.logger = logger
|
|
33
|
-
else:
|
|
34
|
-
self.logger = logging.getLogger()
|
|
35
|
-
self.logger.setLevel("FATAL")
|
|
36
|
-
|
|
37
|
-
@staticmethod
|
|
38
|
-
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
39
|
-
try:
|
|
40
|
-
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
41
|
-
return int(ip)
|
|
42
|
-
except Exception:
|
|
43
|
-
pass
|
|
44
|
-
|
|
45
|
-
@staticmethod
|
|
46
|
-
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
47
|
-
try:
|
|
48
|
-
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
49
|
-
return str(int(ip))
|
|
50
|
-
except Exception:
|
|
51
|
-
pass
|
|
52
|
-
|
|
53
|
-
@staticmethod
|
|
54
|
-
def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
|
|
55
|
-
try:
|
|
56
|
-
return ip_address(ip)
|
|
57
|
-
except ValueError:
|
|
58
|
-
pass
|
|
59
|
-
|
|
60
|
-
@staticmethod
|
|
61
|
-
def _is_ipv4(ip: Optional[_BaseAddress]):
|
|
62
|
-
return ip is not None and (
|
|
63
|
-
isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
@staticmethod
|
|
67
|
-
def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
|
|
68
|
-
if isinstance(ip, IPv4Address):
|
|
69
|
-
return ip
|
|
70
|
-
return None
|
|
71
|
-
|
|
72
|
-
@staticmethod
|
|
73
|
-
def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
|
|
74
|
-
if isinstance(ip, IPv6Address):
|
|
75
|
-
return ip
|
|
76
|
-
if isinstance(ip, IPv4Address):
|
|
77
|
-
return IPv6Address("::ffff:" + str(ip))
|
|
78
|
-
return None
|
|
79
|
-
|
|
80
|
-
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
81
|
-
"""Convert ip address to int"""
|
|
82
|
-
self.logger.info("Convert ip address to int")
|
|
83
|
-
original_ip = self.columns_renaming[self.ip_column]
|
|
84
|
-
|
|
85
|
-
df[self.ip_column] = df[self.ip_column].apply(self._safe_ip_parse)
|
|
86
|
-
if df[self.ip_column].isnull().all():
|
|
87
|
-
raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
|
|
88
|
-
|
|
89
|
-
# legacy support
|
|
90
|
-
ipv4 = self.ip_column + "_v4"
|
|
91
|
-
df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
|
|
92
|
-
self.search_keys[ipv4] = SearchKey.IP
|
|
93
|
-
self.columns_renaming[ipv4] = original_ip
|
|
94
|
-
|
|
95
|
-
ipv6 = self.ip_column + "_v6"
|
|
96
|
-
df[ipv6] = (
|
|
97
|
-
df[self.ip_column]
|
|
98
|
-
.apply(self._to_ipv6)
|
|
99
|
-
.apply(self._ip_to_int_str)
|
|
100
|
-
.astype("string")
|
|
101
|
-
# .str.replace(".0", "", regex=False)
|
|
102
|
-
)
|
|
103
|
-
df = df.drop(columns=self.ip_column)
|
|
104
|
-
del self.search_keys[self.ip_column]
|
|
105
|
-
del self.columns_renaming[self.ip_column]
|
|
106
|
-
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
107
|
-
self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
|
|
108
|
-
|
|
109
|
-
return df
|
|
110
|
-
|
|
111
|
-
|
|
112
13
|
class IpToCountrySearchKeyConverter:
|
|
113
14
|
url = "http://ip-api.com/json/{}"
|
|
114
15
|
|
upgini/utils/phone_utils.py
CHANGED
|
@@ -1,14 +1,5 @@
|
|
|
1
|
-
from typing import Optional
|
|
2
|
-
|
|
3
1
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import (
|
|
5
|
-
is_float_dtype,
|
|
6
|
-
is_int64_dtype,
|
|
7
|
-
is_object_dtype,
|
|
8
|
-
is_string_dtype,
|
|
9
|
-
)
|
|
10
2
|
|
|
11
|
-
from upgini.errors import ValidationError
|
|
12
3
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
13
4
|
|
|
14
5
|
|
|
@@ -18,339 +9,3 @@ class PhoneSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
18
9
|
|
|
19
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
20
11
|
return False
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class PhoneSearchKeyConverter:
|
|
24
|
-
|
|
25
|
-
def __init__(self, phone_column: str, country_column: Optional[str] = None):
|
|
26
|
-
self.phone_column = phone_column
|
|
27
|
-
self.country_column = country_column
|
|
28
|
-
|
|
29
|
-
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
30
|
-
df = self.phone_to_int(df)
|
|
31
|
-
if self.country_column is not None:
|
|
32
|
-
df = df.apply(self.add_prefix, axis=1)
|
|
33
|
-
df[self.phone_column] = df[self.phone_column].astype("Int64")
|
|
34
|
-
return df
|
|
35
|
-
|
|
36
|
-
def add_prefix(self, row):
|
|
37
|
-
phone = row[self.phone_column]
|
|
38
|
-
if pd.isna(phone):
|
|
39
|
-
return row
|
|
40
|
-
country = row[self.country_column]
|
|
41
|
-
country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
|
|
42
|
-
if country_prefix_tuple is not None:
|
|
43
|
-
country_prefix, number_of_digits = country_prefix_tuple
|
|
44
|
-
if len(str(phone)) == number_of_digits:
|
|
45
|
-
row[self.phone_column] = int(country_prefix + str(phone))
|
|
46
|
-
return row
|
|
47
|
-
|
|
48
|
-
def phone_to_int(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
49
|
-
"""
|
|
50
|
-
Convention: phone number is always presented as int number.
|
|
51
|
-
phone_number = Country code + National Destination Code + Subscriber Number.
|
|
52
|
-
Examples:
|
|
53
|
-
41793834315 for Switzerland
|
|
54
|
-
46767040672 for Sweden
|
|
55
|
-
861065529988 for China
|
|
56
|
-
18143008198 for the USA
|
|
57
|
-
Inplace conversion of phone to int.
|
|
58
|
-
|
|
59
|
-
Method will remove all non numeric chars from string and convert it to int.
|
|
60
|
-
None will be set for phone numbers that couldn"t be converted to int
|
|
61
|
-
"""
|
|
62
|
-
if is_string_dtype(df[self.phone_column]) or is_object_dtype(df[self.phone_column]):
|
|
63
|
-
convert_func = self.phone_str_to_int_safe
|
|
64
|
-
elif is_float_dtype(df[self.phone_column]):
|
|
65
|
-
convert_func = self.phone_float_to_int_safe
|
|
66
|
-
elif is_int64_dtype(df[self.phone_column]):
|
|
67
|
-
convert_func = self.phone_int_to_int_safe
|
|
68
|
-
else:
|
|
69
|
-
raise ValidationError(
|
|
70
|
-
f"phone_column_name {self.phone_column} doesn't have supported dtype. "
|
|
71
|
-
f"Dataset dtypes: {df.dtypes}. "
|
|
72
|
-
f"Contact developer and request to implement conversion of {self.phone_column} to int"
|
|
73
|
-
)
|
|
74
|
-
df[self.phone_column] = df[self.phone_column].apply(convert_func).astype("Int64")
|
|
75
|
-
return df
|
|
76
|
-
|
|
77
|
-
@staticmethod
|
|
78
|
-
def phone_float_to_int_safe(value: float) -> Optional[int]:
|
|
79
|
-
try:
|
|
80
|
-
return PhoneSearchKeyConverter.validate_length(int(value))
|
|
81
|
-
except Exception:
|
|
82
|
-
return None
|
|
83
|
-
|
|
84
|
-
@staticmethod
|
|
85
|
-
def phone_int_to_int_safe(value: int) -> Optional[int]:
|
|
86
|
-
try:
|
|
87
|
-
return PhoneSearchKeyConverter.validate_length(int(value))
|
|
88
|
-
except Exception:
|
|
89
|
-
return None
|
|
90
|
-
|
|
91
|
-
@staticmethod
|
|
92
|
-
def phone_str_to_int_safe(value: str) -> Optional[int]:
|
|
93
|
-
try:
|
|
94
|
-
value = str(value)
|
|
95
|
-
if value.endswith(".0"):
|
|
96
|
-
value = value[: len(value) - 2]
|
|
97
|
-
numeric_filter = filter(str.isdigit, value)
|
|
98
|
-
numeric_string = "".join(numeric_filter)
|
|
99
|
-
return PhoneSearchKeyConverter.validate_length(int(numeric_string))
|
|
100
|
-
except Exception:
|
|
101
|
-
return None
|
|
102
|
-
|
|
103
|
-
@staticmethod
|
|
104
|
-
def validate_length(value: int) -> Optional[int]:
|
|
105
|
-
if value < 10000000 or value > 999999999999999:
|
|
106
|
-
return None
|
|
107
|
-
else:
|
|
108
|
-
return value
|
|
109
|
-
|
|
110
|
-
COUNTRIES_PREFIXES = {
|
|
111
|
-
"US": ("1", 10),
|
|
112
|
-
"CA": ("1", 10),
|
|
113
|
-
"AI": ("1", 10),
|
|
114
|
-
"AG": ("1", 10),
|
|
115
|
-
"AS": ("1", 10),
|
|
116
|
-
"BB": ("1", 10),
|
|
117
|
-
"BS": ("1", 10),
|
|
118
|
-
"VG": ("1", 10),
|
|
119
|
-
"VI": ("1", 10),
|
|
120
|
-
"KY": ("1", 10),
|
|
121
|
-
"BM": ("1", 10),
|
|
122
|
-
"GD": ("1", 10),
|
|
123
|
-
"TC": ("1", 10),
|
|
124
|
-
"MS": ("1", 10),
|
|
125
|
-
"MP": ("1", 10),
|
|
126
|
-
"GU": ("1", 10),
|
|
127
|
-
"SX": ("1", 10),
|
|
128
|
-
"LC": ("1", 10),
|
|
129
|
-
"DM": ("1", 10),
|
|
130
|
-
"VC": ("1", 10),
|
|
131
|
-
"PR": ("1", 10),
|
|
132
|
-
"TT": ("1", 10),
|
|
133
|
-
"KN": ("1", 10),
|
|
134
|
-
"JM": ("1", 10),
|
|
135
|
-
"EG": ("20", 9),
|
|
136
|
-
"SS": ("211", 9),
|
|
137
|
-
"MA": ("212", 9),
|
|
138
|
-
"EH": ("212", 4),
|
|
139
|
-
"DZ": ("213", 8),
|
|
140
|
-
"TN": ("216", 8),
|
|
141
|
-
"LY": ("218", 9),
|
|
142
|
-
"GM": ("220", 6),
|
|
143
|
-
"SN": ("221", 9),
|
|
144
|
-
"MR": ("222", 7),
|
|
145
|
-
"ML": ("223", 8),
|
|
146
|
-
"GN": ("224", 9),
|
|
147
|
-
"CI": ("225", 7),
|
|
148
|
-
"BF": ("226", 8),
|
|
149
|
-
"NE": ("227", 8),
|
|
150
|
-
"TG": ("228", 8),
|
|
151
|
-
"BJ": ("229", 8),
|
|
152
|
-
"MU": ("230", 7),
|
|
153
|
-
"LR": ("231", 9),
|
|
154
|
-
"SL": ("232", 8),
|
|
155
|
-
"GH": ("233", 9),
|
|
156
|
-
"NG": ("234", 9),
|
|
157
|
-
"TD": ("235", 8),
|
|
158
|
-
"CF": ("236", 7),
|
|
159
|
-
"CM": ("237", 9),
|
|
160
|
-
"CV": ("238", 7),
|
|
161
|
-
"ST": ("239", 7),
|
|
162
|
-
"GQ": ("240", 9),
|
|
163
|
-
"GA": ("241", 8),
|
|
164
|
-
"CG": ("242", 7),
|
|
165
|
-
"CD": ("243", 9),
|
|
166
|
-
"AO": ("244", 9),
|
|
167
|
-
"GW": ("245", 6),
|
|
168
|
-
"IO": ("246", 7),
|
|
169
|
-
"AC": ("247", 5),
|
|
170
|
-
"SC": ("248", 7),
|
|
171
|
-
"SD": ("249", 9),
|
|
172
|
-
"RW": ("250", 9),
|
|
173
|
-
"ET": ("251", 9),
|
|
174
|
-
"SO": ("252", 9),
|
|
175
|
-
"DJ": ("253", 8),
|
|
176
|
-
"KE": ("254", 9),
|
|
177
|
-
"TZ": ("255", 9),
|
|
178
|
-
"UG": ("256", 9),
|
|
179
|
-
"BI": ("257", 8),
|
|
180
|
-
"MZ": ("258", 8),
|
|
181
|
-
"ZM": ("260", 9),
|
|
182
|
-
"MG": ("261", 9),
|
|
183
|
-
"RE": ("262", 9),
|
|
184
|
-
"YT": ("262", 9),
|
|
185
|
-
"TF": ("262", 9),
|
|
186
|
-
"ZW": ("263", 9),
|
|
187
|
-
"NA": ("264", 9),
|
|
188
|
-
"MW": ("265", 7),
|
|
189
|
-
"LS": ("266", 8),
|
|
190
|
-
"BW": ("267", 7),
|
|
191
|
-
"SZ": ("268", 8),
|
|
192
|
-
"KM": ("269", 7),
|
|
193
|
-
"ZA": ("27", 10),
|
|
194
|
-
"SH": ("290", 5),
|
|
195
|
-
"TA": ("290", 5),
|
|
196
|
-
"ER": ("291", 7),
|
|
197
|
-
"AT": ("43", 10),
|
|
198
|
-
"AW": ("297", 7),
|
|
199
|
-
"FO": ("298", 6),
|
|
200
|
-
"GL": ("299", 6),
|
|
201
|
-
"GR": ("30", 10),
|
|
202
|
-
"BE": ("32", 8),
|
|
203
|
-
"FR": ("33", 9),
|
|
204
|
-
"ES": ("34", 9),
|
|
205
|
-
"GI": ("350", 8),
|
|
206
|
-
"PE": ("51", 8),
|
|
207
|
-
"MX": ("52", 10),
|
|
208
|
-
"CU": ("53", 8),
|
|
209
|
-
"AR": ("54", 10),
|
|
210
|
-
"BR": ("55", 10),
|
|
211
|
-
"CL": ("56", 9),
|
|
212
|
-
"CO": ("57", 8),
|
|
213
|
-
"VE": ("58", 10),
|
|
214
|
-
"PT": ("351", 9),
|
|
215
|
-
"LU": ("352", 8),
|
|
216
|
-
"IE": ("353", 8),
|
|
217
|
-
"IS": ("354", 7),
|
|
218
|
-
"AL": ("355", 8),
|
|
219
|
-
"MT": ("356", 8),
|
|
220
|
-
"CY": ("357", 8),
|
|
221
|
-
"FI": ("358", 9),
|
|
222
|
-
"BG": ("359", 8),
|
|
223
|
-
"HU": ("36", 8),
|
|
224
|
-
"LT": ("370", 8),
|
|
225
|
-
"LV": ("371", 8),
|
|
226
|
-
"EE": ("372", 7),
|
|
227
|
-
"MD": ("373", 8),
|
|
228
|
-
"AM": ("374", 8),
|
|
229
|
-
"BY": ("375", 9),
|
|
230
|
-
"AD": ("376", 6),
|
|
231
|
-
"MC": ("377", 8),
|
|
232
|
-
"SM": ("378", 9),
|
|
233
|
-
"VA": ("3906698", 5),
|
|
234
|
-
"UA": ("380", 9),
|
|
235
|
-
"RS": ("381", 9),
|
|
236
|
-
"ME": ("382", 8),
|
|
237
|
-
"HR": ("385", 8),
|
|
238
|
-
"SI": ("386", 8),
|
|
239
|
-
"BA": ("387", 8),
|
|
240
|
-
"MK": ("389", 8),
|
|
241
|
-
"MY": ("60", 9),
|
|
242
|
-
"AU": ("61", 9),
|
|
243
|
-
"CX": ("61", 9),
|
|
244
|
-
"CC": ("61", 9),
|
|
245
|
-
"ID": ("62", 9),
|
|
246
|
-
"PH": ("632", 7),
|
|
247
|
-
"NZ": ("64", 8),
|
|
248
|
-
"PN": ("64", 8),
|
|
249
|
-
"SG": ("65", 8),
|
|
250
|
-
"TH": ("66", 8),
|
|
251
|
-
"IT": ("39", 10),
|
|
252
|
-
"RO": ("40", 9),
|
|
253
|
-
"CH": ("41", 9),
|
|
254
|
-
"CZ": ("420", 9),
|
|
255
|
-
"SK": ("421", 9),
|
|
256
|
-
"GB": ("44", 10),
|
|
257
|
-
"LI": ("423", 7),
|
|
258
|
-
"GG": ("44", 10),
|
|
259
|
-
"IM": ("44", 10),
|
|
260
|
-
"JE": ("44", 10),
|
|
261
|
-
"DK": ("45", 8),
|
|
262
|
-
"SE": ("46", 8),
|
|
263
|
-
"BD": ("880", 8),
|
|
264
|
-
"TW": ("886", 9),
|
|
265
|
-
"JP": ("81", 9),
|
|
266
|
-
"KR": ("82", 9),
|
|
267
|
-
"VN": ("84", 10),
|
|
268
|
-
"KP": ("850", 8),
|
|
269
|
-
"HK": ("852", 8),
|
|
270
|
-
"MO": ("853", 8),
|
|
271
|
-
"KH": ("855", 8),
|
|
272
|
-
"LA": ("856", 8),
|
|
273
|
-
"NO": ("47", 8),
|
|
274
|
-
"SJ": ("47", 8),
|
|
275
|
-
"BV": ("47", 8),
|
|
276
|
-
"PL": ("48", 9),
|
|
277
|
-
"DE": ("49", 10),
|
|
278
|
-
"TR": ("90", 10),
|
|
279
|
-
"IN": ("91", 10),
|
|
280
|
-
"PK": ("92", 9),
|
|
281
|
-
"AF": ("93", 9),
|
|
282
|
-
"LK": ("94", 9),
|
|
283
|
-
"MM": ("95", 7),
|
|
284
|
-
"IR": ("98", 10),
|
|
285
|
-
"MV": ("960", 7),
|
|
286
|
-
"LB": ("961", 7),
|
|
287
|
-
"JO": ("962", 9),
|
|
288
|
-
"SY": ("963", 10),
|
|
289
|
-
"IQ": ("964", 10),
|
|
290
|
-
"KW": ("965", 7),
|
|
291
|
-
"SA": ("966", 9),
|
|
292
|
-
"YE": ("967", 7),
|
|
293
|
-
"OM": ("968", 8),
|
|
294
|
-
"PS": ("970", 8),
|
|
295
|
-
"AE": ("971", 8),
|
|
296
|
-
"IL": ("972", 9),
|
|
297
|
-
"BH": ("973", 8),
|
|
298
|
-
"QA": ("974", 8),
|
|
299
|
-
"BT": ("975", 7),
|
|
300
|
-
"MN": ("976", 8),
|
|
301
|
-
"NP": ("977", 8),
|
|
302
|
-
"TJ": ("992", 9),
|
|
303
|
-
"TM": ("993", 8),
|
|
304
|
-
"AZ": ("994", 9),
|
|
305
|
-
"GE": ("995", 9),
|
|
306
|
-
"KG": ("996", 9),
|
|
307
|
-
"UZ": ("998", 9),
|
|
308
|
-
"FK": ("500", 5),
|
|
309
|
-
"BZ": ("501", 7),
|
|
310
|
-
"GT": ("502", 8),
|
|
311
|
-
"SV": ("503", 8),
|
|
312
|
-
"HN": ("504", 8),
|
|
313
|
-
"NI": ("505", 8),
|
|
314
|
-
"CR": ("506", 8),
|
|
315
|
-
"PA": ("507", 7),
|
|
316
|
-
"PM": ("508", 6),
|
|
317
|
-
"HT": ("509", 8),
|
|
318
|
-
"GS": ("500", 5),
|
|
319
|
-
"MF": ("590", 9),
|
|
320
|
-
"BL": ("590", 9),
|
|
321
|
-
"GP": ("590", 9),
|
|
322
|
-
"BO": ("591", 9),
|
|
323
|
-
"GY": ("592", 9),
|
|
324
|
-
"EC": ("593", 9),
|
|
325
|
-
"GF": ("594", 9),
|
|
326
|
-
"PY": ("595", 9),
|
|
327
|
-
"MQ": ("596", 9),
|
|
328
|
-
"SR": ("597", 9),
|
|
329
|
-
"UY": ("598", 9),
|
|
330
|
-
"CW": ("599", 9),
|
|
331
|
-
"BQ": ("599", 9),
|
|
332
|
-
"RU": ("7", 10),
|
|
333
|
-
"KZ": ("7", 10),
|
|
334
|
-
"TL": ("670", 7),
|
|
335
|
-
"NF": ("672", 7),
|
|
336
|
-
"HM": ("672", 7),
|
|
337
|
-
"BN": ("673", 7),
|
|
338
|
-
"NR": ("674", 7),
|
|
339
|
-
"PG": ("675", 7),
|
|
340
|
-
"TO": ("676", 7),
|
|
341
|
-
"SB": ("677", 7),
|
|
342
|
-
"VU": ("678", 7),
|
|
343
|
-
"FJ": ("679", 7),
|
|
344
|
-
"PW": ("680", 7),
|
|
345
|
-
"WF": ("681", 7),
|
|
346
|
-
"CK": ("682", 5),
|
|
347
|
-
"NU": ("683", 7),
|
|
348
|
-
"WS": ("685", 7),
|
|
349
|
-
"KI": ("686", 7),
|
|
350
|
-
"NC": ("687", 7),
|
|
351
|
-
"TV": ("688", 7),
|
|
352
|
-
"PF": ("689", 7),
|
|
353
|
-
"TK": ("690", 7),
|
|
354
|
-
"FM": ("691", 7),
|
|
355
|
-
"MH": ("692", 7),
|
|
356
|
-
}
|
|
@@ -1,9 +1,4 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from pandas.api.types import (
|
|
3
|
-
is_float_dtype,
|
|
4
|
-
is_object_dtype,
|
|
5
|
-
is_string_dtype,
|
|
6
|
-
)
|
|
7
2
|
|
|
8
3
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
9
4
|
|
|
@@ -14,32 +9,3 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
14
9
|
|
|
15
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
16
11
|
return False
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class PostalCodeSearchKeyConverter:
|
|
20
|
-
|
|
21
|
-
def __init__(self, postal_code_column: str):
|
|
22
|
-
self.postal_code_column = postal_code_column
|
|
23
|
-
|
|
24
|
-
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
25
|
-
if is_string_dtype(df[self.postal_code_column]) or is_object_dtype(df[self.postal_code_column]):
|
|
26
|
-
try:
|
|
27
|
-
df[self.postal_code_column] = (
|
|
28
|
-
df[self.postal_code_column].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
29
|
-
)
|
|
30
|
-
except Exception:
|
|
31
|
-
pass
|
|
32
|
-
elif is_float_dtype(df[self.postal_code_column]):
|
|
33
|
-
df[self.postal_code_column] = df[self.postal_code_column].astype("Int64").astype("string")
|
|
34
|
-
|
|
35
|
-
df[self.postal_code_column] = (
|
|
36
|
-
df[self.postal_code_column]
|
|
37
|
-
.astype("string")
|
|
38
|
-
.str.upper()
|
|
39
|
-
.str.replace(r"[^0-9A-Z]", "", regex=True) # remove non alphanumeric characters
|
|
40
|
-
.str.replace(r"^0+\B", "", regex=True) # remove leading zeros
|
|
41
|
-
)
|
|
42
|
-
# if (df[self.postal_code_column] == "").all():
|
|
43
|
-
# raise ValidationError(self.bundle.get("invalid_postal_code").format(self.postal_code_column))
|
|
44
|
-
|
|
45
|
-
return df
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.309a1
|
|
3
|
+
Version: 1.1.309a3511.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
29
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
30
32
|
Requires-Dist: numpy>=1.19.0
|
|
31
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=0P3dkxdi76OjQXeOAzNBJ_LhQgj5SsS2hOszg6gn51I,34
|
|
2
2
|
upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=JzSnwqxUKuYqBC4DHgPcG4MxQzvnCfKuOgihTllwRis,182583
|
|
7
7
|
upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=E5WWZ_MkjGyYNQh_LnwMIBHyqPx1fxk-qhEfQIJnzq8,10209
|
|
10
10
|
upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
|
|
11
11
|
upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -14,20 +14,20 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=LZfcfRqYTrrRbUsTEiY6O7TImr0NR4Idn-WoJongrTM,2594
|
|
18
|
+
upgini/autofe/binary.py,sha256=9W1DL2kZEmgV1P-0BEy8JYj9u_xhiDPKfeEsFQfrlkU,6860
|
|
19
|
+
upgini/autofe/date.py,sha256=INgiSfhkEiK3s6JL47O9EQrXITwqFrXo-KoihCdO5B4,8440
|
|
20
|
+
upgini/autofe/feature.py,sha256=99xakEK6kQKoduAtblIAAkCsG8fezfHQA4eji8c1i8E,14200
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
22
22
|
upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=y0tYFp7N3bSI7BwQ5SRF8r0bRaI3z6Zc1fsZezVg7hE,20552
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/
|
|
30
|
+
upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
33
|
upgini/resource_bundle/strings.properties,sha256=WZAuYPX2Dpn6BHoA3RX8uvMNMr-yJE2fF7Gz0i24x2s,26459
|
|
@@ -39,25 +39,25 @@ upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
|
39
39
|
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
40
40
|
upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
|
|
41
41
|
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
42
|
-
upgini/utils/country_utils.py,sha256=
|
|
42
|
+
upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
+
upgini/utils/datetime_utils.py,sha256=uJ3wJNr4KQvDJ-gSOLcmP85hLtASK271o6mob4aZT90,11064
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
|
-
upgini/utils/email_utils.py,sha256=
|
|
48
|
+
upgini/utils/email_utils.py,sha256=aKHa4xVBSsEsiZtFCPj_DrUaFupceYfvJeP_e8w_D5E,3813
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
|
-
upgini/utils/ip_utils.py,sha256=
|
|
53
|
-
upgini/utils/phone_utils.py,sha256=
|
|
54
|
-
upgini/utils/postal_code_utils.py,sha256=
|
|
52
|
+
upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
53
|
+
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
54
|
+
upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.309a3511.dev1.dist-info/METADATA,sha256=N8KmmTFnzh0IbvUEjcBaK3-V1pg54v_DO_6fW7EHrsA,48232
|
|
61
|
+
upgini-1.1.309a3511.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.1.309a3511.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.309a3511.dev1.dist-info/RECORD,,
|