upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +71 -71
- upgini/autofe/date.py +21 -21
- upgini/autofe/feature.py +2 -2
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +4 -4
- upgini/autofe/unary.py +47 -46
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +9 -0
- upgini/dataset.py +34 -387
- upgini/features_enricher.py +338 -169
- upgini/http.py +20 -31
- upgini/lazy_import.py +14 -1
- upgini/metadata.py +72 -57
- upgini/normalizer/normalize_utils.py +202 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +41 -20
- upgini/utils/email_utils.py +49 -17
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/target_utils.py +4 -1
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/METADATA +3 -3
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/RECORD +26 -26
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/licenses/LICENSE +0 -0
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,13 +6,10 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import
|
|
10
|
-
is_numeric_dtype,
|
|
11
|
-
is_period_dtype,
|
|
12
|
-
)
|
|
9
|
+
from pandas.api.types import is_numeric_dtype
|
|
13
10
|
|
|
14
11
|
from upgini.errors import ValidationError
|
|
15
|
-
from upgini.metadata import SearchKey
|
|
12
|
+
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
16
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
17
14
|
from upgini.utils.warning_counter import WarningCounter
|
|
18
15
|
|
|
@@ -31,18 +28,22 @@ DATE_FORMATS = [
|
|
|
31
28
|
"%Y-%m-%dT%H:%M:%S.%f",
|
|
32
29
|
]
|
|
33
30
|
|
|
34
|
-
DATETIME_PATTERN = r"^[\d\s\.\-:T
|
|
31
|
+
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
35
32
|
|
|
36
33
|
|
|
37
34
|
class DateTimeSearchKeyConverter:
|
|
38
35
|
DATETIME_COL = "_date_time"
|
|
36
|
+
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
37
|
+
MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
|
|
39
38
|
|
|
40
39
|
def __init__(
|
|
41
40
|
self,
|
|
42
41
|
date_column: str,
|
|
43
42
|
date_format: Optional[str] = None,
|
|
44
43
|
logger: Optional[logging.Logger] = None,
|
|
45
|
-
bundle: ResourceBundle = None,
|
|
44
|
+
bundle: Optional[ResourceBundle] = None,
|
|
45
|
+
warnings_counter: Optional[WarningCounter] = None,
|
|
46
|
+
silent_mode=False,
|
|
46
47
|
):
|
|
47
48
|
self.date_column = date_column
|
|
48
49
|
self.date_format = date_format
|
|
@@ -53,6 +54,8 @@ class DateTimeSearchKeyConverter:
|
|
|
53
54
|
self.logger.setLevel("FATAL")
|
|
54
55
|
self.generated_features: List[str] = []
|
|
55
56
|
self.bundle = bundle or get_custom_bundle()
|
|
57
|
+
self.warnings_counter = warnings_counter or WarningCounter()
|
|
58
|
+
self.silent_mode = silent_mode
|
|
56
59
|
|
|
57
60
|
@staticmethod
|
|
58
61
|
def _int_to_opt(i: int) -> Optional[int]:
|
|
@@ -81,20 +84,20 @@ class DateTimeSearchKeyConverter:
|
|
|
81
84
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
82
85
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
83
86
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
84
|
-
elif
|
|
87
|
+
elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
|
|
85
88
|
df[self.date_column] = df[self.date_column].dt.to_timestamp()
|
|
86
89
|
elif is_numeric_dtype(df[self.date_column]):
|
|
87
90
|
# 315532801 - 2524608001 - seconds
|
|
88
91
|
# 315532801000 - 2524608001000 - milliseconds
|
|
89
92
|
# 315532801000000 - 2524608001000000 - microseconds
|
|
90
93
|
# 315532801000000000 - 2524608001000000000 - nanoseconds
|
|
91
|
-
if df[self.date_column].apply(lambda x: 10
|
|
94
|
+
if df[self.date_column].apply(lambda x: 10**16 < x).all():
|
|
92
95
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
|
|
93
|
-
elif df[self.date_column].apply(lambda x: 10
|
|
96
|
+
elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
|
|
94
97
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
|
|
95
|
-
elif df[self.date_column].apply(lambda x: 10
|
|
98
|
+
elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
|
|
96
99
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
|
|
97
|
-
elif df[self.date_column].apply(lambda x: 0 < x < 10
|
|
100
|
+
elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
|
|
98
101
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
|
|
99
102
|
else:
|
|
100
103
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
@@ -108,6 +111,9 @@ class DateTimeSearchKeyConverter:
|
|
|
108
111
|
# as additional features
|
|
109
112
|
seconds = "datetime_seconds"
|
|
110
113
|
df[self.date_column] = df[self.date_column].dt.tz_localize(None)
|
|
114
|
+
|
|
115
|
+
df = self.clean_old_dates(df)
|
|
116
|
+
|
|
111
117
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
112
118
|
|
|
113
119
|
seconds_without_na = df[seconds].dropna()
|
|
@@ -152,6 +158,19 @@ class DateTimeSearchKeyConverter:
|
|
|
152
158
|
except ValueError:
|
|
153
159
|
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
154
160
|
|
|
161
|
+
def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
162
|
+
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
163
|
+
old_subset = df[condition]
|
|
164
|
+
if len(old_subset) > 0:
|
|
165
|
+
df.loc[condition, self.date_column] = None
|
|
166
|
+
self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
|
|
167
|
+
msg = self.bundle.get("dataset_drop_old_dates")
|
|
168
|
+
self.logger.warning(msg)
|
|
169
|
+
if not self.silent_mode:
|
|
170
|
+
print(msg)
|
|
171
|
+
self.warnings_counter.increment()
|
|
172
|
+
return df
|
|
173
|
+
|
|
155
174
|
|
|
156
175
|
def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
157
176
|
try:
|
|
@@ -188,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
|
188
207
|
def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
|
|
189
208
|
df = df.copy()
|
|
190
209
|
seconds = "datetime_seconds"
|
|
191
|
-
if
|
|
210
|
+
if isinstance(df[date_col].dtype, pd.PeriodDtype):
|
|
192
211
|
df[date_col] = df[date_col].dt.to_timestamp()
|
|
193
212
|
else:
|
|
194
213
|
df[date_col] = pd.to_datetime(df[date_col])
|
|
@@ -238,23 +257,25 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
238
257
|
|
|
239
258
|
|
|
240
259
|
def validate_dates_distribution(
|
|
241
|
-
|
|
260
|
+
df: pd.DataFrame,
|
|
242
261
|
search_keys: Dict[str, SearchKey],
|
|
243
262
|
logger: Optional[logging.Logger] = None,
|
|
244
263
|
bundle: Optional[ResourceBundle] = None,
|
|
245
264
|
warning_counter: Optional[WarningCounter] = None,
|
|
246
265
|
):
|
|
247
|
-
maybe_date_col =
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
266
|
+
maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
267
|
+
|
|
268
|
+
if EVAL_SET_INDEX in df.columns:
|
|
269
|
+
X = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
270
|
+
else:
|
|
271
|
+
X = df
|
|
251
272
|
|
|
252
273
|
if maybe_date_col is None:
|
|
253
274
|
for col in X.columns:
|
|
254
275
|
if col in search_keys:
|
|
255
276
|
continue
|
|
256
277
|
try:
|
|
257
|
-
if
|
|
278
|
+
if isinstance(X[col].dtype, pd.PeriodDtype):
|
|
258
279
|
pass
|
|
259
280
|
elif pd.__version__ >= "2.0.0":
|
|
260
281
|
# Format mixed to avoid massive warnings
|
|
@@ -269,7 +290,7 @@ def validate_dates_distribution(
|
|
|
269
290
|
if maybe_date_col is None:
|
|
270
291
|
return
|
|
271
292
|
|
|
272
|
-
if
|
|
293
|
+
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
273
294
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
274
295
|
elif pd.__version__ >= "2.0.0":
|
|
275
296
|
dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
|
upgini/utils/email_utils.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
8
8
|
|
|
9
9
|
from upgini.metadata import SearchKey
|
|
10
|
-
from upgini.resource_bundle import
|
|
10
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
11
11
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
12
12
|
|
|
13
13
|
EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
|
|
@@ -28,29 +28,53 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
28
28
|
return is_email_count / all_count > 0.1
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
class EmailDomainGenerator:
|
|
32
|
+
DOMAIN_SUFFIX = "_domain"
|
|
33
|
+
|
|
34
|
+
def __init__(self, email_columns: List[str]):
|
|
35
|
+
self.email_columns = email_columns
|
|
36
|
+
self.generated_features = []
|
|
37
|
+
|
|
38
|
+
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
|
+
for email_col in self.email_columns:
|
|
40
|
+
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain)
|
|
42
|
+
self.generated_features.append(domain_feature)
|
|
43
|
+
return df
|
|
44
|
+
|
|
45
|
+
@staticmethod
|
|
46
|
+
def _email_to_domain(email: str) -> Optional[str]:
|
|
47
|
+
if email is not None and isinstance(email, str) and "@" in email:
|
|
48
|
+
name_and_domain = email.split("@")
|
|
49
|
+
if len(name_and_domain) == 2 and len(name_and_domain[1]) > 0:
|
|
50
|
+
return name_and_domain[1]
|
|
51
|
+
|
|
52
|
+
|
|
31
53
|
class EmailSearchKeyConverter:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
|
|
54
|
+
HEM_SUFFIX = "_hem"
|
|
55
|
+
ONE_DOMAIN_SUFFIX = "_one_domain"
|
|
35
56
|
|
|
36
57
|
def __init__(
|
|
37
58
|
self,
|
|
38
59
|
email_column: str,
|
|
39
60
|
hem_column: Optional[str],
|
|
40
61
|
search_keys: Dict[str, SearchKey],
|
|
62
|
+
columns_renaming: Dict[str, str],
|
|
41
63
|
unnest_search_keys: Optional[List[str]] = None,
|
|
64
|
+
bundle: Optional[ResourceBundle] = None,
|
|
42
65
|
logger: Optional[logging.Logger] = None,
|
|
43
66
|
):
|
|
44
67
|
self.email_column = email_column
|
|
45
68
|
self.hem_column = hem_column
|
|
46
69
|
self.search_keys = search_keys
|
|
70
|
+
self.columns_renaming = columns_renaming
|
|
47
71
|
self.unnest_search_keys = unnest_search_keys
|
|
72
|
+
self.bundle = bundle or get_custom_bundle()
|
|
48
73
|
if logger is not None:
|
|
49
74
|
self.logger = logger
|
|
50
75
|
else:
|
|
51
76
|
self.logger = logging.getLogger()
|
|
52
77
|
self.logger.setLevel("FATAL")
|
|
53
|
-
self.generated_features: List[str] = []
|
|
54
78
|
self.email_converted_to_hem = False
|
|
55
79
|
|
|
56
80
|
@staticmethod
|
|
@@ -61,7 +85,7 @@ class EmailSearchKeyConverter:
|
|
|
61
85
|
if not EMAIL_REGEX.fullmatch(email):
|
|
62
86
|
return None
|
|
63
87
|
|
|
64
|
-
return sha256(email.lower().encode("utf-8")).hexdigest()
|
|
88
|
+
return sha256(email.lower().encode("utf-8")).hexdigest().lower()
|
|
65
89
|
|
|
66
90
|
@staticmethod
|
|
67
91
|
def _email_to_one_domain(email: str) -> Optional[str]:
|
|
@@ -72,28 +96,36 @@ class EmailSearchKeyConverter:
|
|
|
72
96
|
|
|
73
97
|
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
74
98
|
df = df.copy()
|
|
99
|
+
original_email_column = self.columns_renaming[self.email_column]
|
|
75
100
|
if self.hem_column is None:
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
101
|
+
hem_name = self.email_column + self.HEM_SUFFIX
|
|
102
|
+
df[hem_name] = df[self.email_column].apply(self._email_to_hem)
|
|
103
|
+
if df[hem_name].isna().all():
|
|
104
|
+
msg = self.bundle.get("all_emails_invalid").format(self.email_column)
|
|
79
105
|
print(msg)
|
|
80
106
|
self.logger.warning(msg)
|
|
81
|
-
df = df.drop(columns=
|
|
107
|
+
df = df.drop(columns=hem_name)
|
|
82
108
|
del self.search_keys[self.email_column]
|
|
83
109
|
return df
|
|
84
|
-
self.search_keys[
|
|
85
|
-
self.
|
|
110
|
+
self.search_keys[hem_name] = SearchKey.HEM
|
|
111
|
+
if self.email_column in self.unnest_search_keys:
|
|
112
|
+
self.unnest_search_keys.append(hem_name)
|
|
113
|
+
self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
|
|
86
114
|
self.email_converted_to_hem = True
|
|
115
|
+
else:
|
|
116
|
+
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
87
117
|
|
|
88
118
|
del self.search_keys[self.email_column]
|
|
89
119
|
if self.email_column in self.unnest_search_keys:
|
|
90
120
|
self.unnest_search_keys.remove(self.email_column)
|
|
91
121
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
self.
|
|
122
|
+
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
123
|
+
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
124
|
+
self.columns_renaming[one_domain_name] = original_email_column
|
|
125
|
+
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
95
126
|
|
|
96
|
-
|
|
97
|
-
|
|
127
|
+
if self.email_converted_to_hem:
|
|
128
|
+
df = df.drop(columns=self.email_column)
|
|
129
|
+
del self.columns_renaming[self.email_column]
|
|
98
130
|
|
|
99
131
|
return df
|
upgini/utils/ip_utils.py
CHANGED
|
@@ -1,15 +1,114 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
|
|
3
|
+
from typing import Dict, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
from requests import get
|
|
6
7
|
|
|
8
|
+
from upgini.errors import ValidationError
|
|
7
9
|
from upgini.metadata import SearchKey
|
|
10
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
8
11
|
|
|
9
12
|
# from upgini.resource_bundle import bundle
|
|
10
13
|
# from upgini.utils.track_info import get_track_metrics
|
|
11
14
|
|
|
12
15
|
|
|
16
|
+
class IpSearchKeyConverter:
|
|
17
|
+
def __init__(
|
|
18
|
+
self,
|
|
19
|
+
ip_column: str,
|
|
20
|
+
search_keys: Dict[str, SearchKey],
|
|
21
|
+
columns_renaming: Dict[str, str],
|
|
22
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
23
|
+
bundle: Optional[ResourceBundle] = None,
|
|
24
|
+
logger: Optional[logging.Logger] = None,
|
|
25
|
+
):
|
|
26
|
+
self.ip_column = ip_column
|
|
27
|
+
self.search_keys = search_keys
|
|
28
|
+
self.columns_renaming = columns_renaming
|
|
29
|
+
self.unnest_search_keys = unnest_search_keys
|
|
30
|
+
self.bundle = bundle or get_custom_bundle()
|
|
31
|
+
if logger is not None:
|
|
32
|
+
self.logger = logger
|
|
33
|
+
else:
|
|
34
|
+
self.logger = logging.getLogger()
|
|
35
|
+
self.logger.setLevel("FATAL")
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
39
|
+
try:
|
|
40
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
41
|
+
return int(ip)
|
|
42
|
+
except Exception:
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
@staticmethod
|
|
46
|
+
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
47
|
+
try:
|
|
48
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
49
|
+
return str(int(ip))
|
|
50
|
+
except Exception:
|
|
51
|
+
pass
|
|
52
|
+
|
|
53
|
+
@staticmethod
|
|
54
|
+
def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
|
|
55
|
+
try:
|
|
56
|
+
return ip_address(ip)
|
|
57
|
+
except ValueError:
|
|
58
|
+
pass
|
|
59
|
+
|
|
60
|
+
@staticmethod
|
|
61
|
+
def _is_ipv4(ip: Optional[_BaseAddress]):
|
|
62
|
+
return ip is not None and (
|
|
63
|
+
isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
@staticmethod
|
|
67
|
+
def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
|
|
68
|
+
if isinstance(ip, IPv4Address):
|
|
69
|
+
return ip
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
|
|
74
|
+
if isinstance(ip, IPv6Address):
|
|
75
|
+
return ip
|
|
76
|
+
if isinstance(ip, IPv4Address):
|
|
77
|
+
return IPv6Address("::ffff:" + str(ip))
|
|
78
|
+
return None
|
|
79
|
+
|
|
80
|
+
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
81
|
+
"""Convert ip address to int"""
|
|
82
|
+
self.logger.info("Convert ip address to int")
|
|
83
|
+
original_ip = self.columns_renaming[self.ip_column]
|
|
84
|
+
|
|
85
|
+
df[self.ip_column] = df[self.ip_column].apply(self._safe_ip_parse)
|
|
86
|
+
if df[self.ip_column].isnull().all():
|
|
87
|
+
raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
|
|
88
|
+
|
|
89
|
+
# legacy support
|
|
90
|
+
ipv4 = self.ip_column + "_v4"
|
|
91
|
+
df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
|
|
92
|
+
self.search_keys[ipv4] = SearchKey.IP
|
|
93
|
+
self.columns_renaming[ipv4] = original_ip
|
|
94
|
+
|
|
95
|
+
ipv6 = self.ip_column + "_v6"
|
|
96
|
+
df[ipv6] = (
|
|
97
|
+
df[self.ip_column]
|
|
98
|
+
.apply(self._to_ipv6)
|
|
99
|
+
.apply(self._ip_to_int_str)
|
|
100
|
+
.astype("string")
|
|
101
|
+
# .str.replace(".0", "", regex=False)
|
|
102
|
+
)
|
|
103
|
+
df = df.drop(columns=self.ip_column)
|
|
104
|
+
del self.search_keys[self.ip_column]
|
|
105
|
+
del self.columns_renaming[self.ip_column]
|
|
106
|
+
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
107
|
+
self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
|
|
108
|
+
|
|
109
|
+
return df
|
|
110
|
+
|
|
111
|
+
|
|
13
112
|
class IpToCountrySearchKeyConverter:
|
|
14
113
|
url = "http://ip-api.com/json/{}"
|
|
15
114
|
|