upgini 1.2.50__py3-none-any.whl → 1.2.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +16 -4
- upgini/metadata.py +1 -0
- upgini/utils/ip_utils.py +35 -14
- {upgini-1.2.50.dist-info → upgini-1.2.51.dist-info}/METADATA +1 -1
- {upgini-1.2.50.dist-info → upgini-1.2.51.dist-info}/RECORD +8 -8
- {upgini-1.2.50.dist-info → upgini-1.2.51.dist-info}/WHEEL +0 -0
- {upgini-1.2.50.dist-info → upgini-1.2.51.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.50"
|
|
1
|
+
__version__ = "1.2.51"
|
upgini/features_enricher.py
CHANGED
|
@@ -2270,6 +2270,7 @@ if response.status_code == 200:
|
|
|
2270
2270
|
df = converter.convert(df)
|
|
2271
2271
|
|
|
2272
2272
|
ip_column = self._get_ip_column(search_keys)
|
|
2273
|
+
ip_prefix_column = None
|
|
2273
2274
|
if ip_column:
|
|
2274
2275
|
converter = IpSearchKeyConverter(
|
|
2275
2276
|
ip_column,
|
|
@@ -2280,6 +2281,7 @@ if response.status_code == 200:
|
|
|
2280
2281
|
self.logger,
|
|
2281
2282
|
)
|
|
2282
2283
|
df = converter.convert(df)
|
|
2284
|
+
ip_prefix_column = converter.ip_prefix_column
|
|
2283
2285
|
|
|
2284
2286
|
phone_column = self._get_phone_column(search_keys)
|
|
2285
2287
|
country_column = self._get_country_column(search_keys)
|
|
@@ -2299,12 +2301,15 @@ if response.status_code == 200:
|
|
|
2299
2301
|
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2300
2302
|
|
|
2301
2303
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2304
|
+
if ip_prefix_column:
|
|
2305
|
+
meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
|
|
2302
2306
|
for col in features_for_transform:
|
|
2303
2307
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2304
2308
|
features_not_to_pass = [
|
|
2305
2309
|
c
|
|
2306
2310
|
for c in df.columns
|
|
2307
2311
|
if c not in search_keys.keys()
|
|
2312
|
+
and c != ip_prefix_column
|
|
2308
2313
|
and c not in features_for_transform
|
|
2309
2314
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2310
2315
|
]
|
|
@@ -2766,6 +2771,7 @@ if response.status_code == 200:
|
|
|
2766
2771
|
df = converter.convert(df)
|
|
2767
2772
|
|
|
2768
2773
|
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2774
|
+
ip_prefix_column = None
|
|
2769
2775
|
if ip_column:
|
|
2770
2776
|
converter = IpSearchKeyConverter(
|
|
2771
2777
|
ip_column,
|
|
@@ -2776,7 +2782,7 @@ if response.status_code == 200:
|
|
|
2776
2782
|
self.logger,
|
|
2777
2783
|
)
|
|
2778
2784
|
df = converter.convert(df)
|
|
2779
|
-
|
|
2785
|
+
ip_prefix_column = converter.ip_prefix_column
|
|
2780
2786
|
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2781
2787
|
country_column = self._get_country_column(self.fit_search_keys)
|
|
2782
2788
|
if phone_column:
|
|
@@ -2792,9 +2798,13 @@ if response.status_code == 200:
|
|
|
2792
2798
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2793
2799
|
df = converter.convert(df)
|
|
2794
2800
|
|
|
2795
|
-
non_feature_columns = [
|
|
2796
|
-
self.
|
|
2797
|
-
|
|
2801
|
+
non_feature_columns = [
|
|
2802
|
+
self.TARGET_NAME,
|
|
2803
|
+
EVAL_SET_INDEX,
|
|
2804
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
2805
|
+
SEARCH_KEY_UNNEST,
|
|
2806
|
+
ip_prefix_column,
|
|
2807
|
+
] + list(self.fit_search_keys.keys())
|
|
2798
2808
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2799
2809
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2800
2810
|
|
|
@@ -2815,6 +2825,8 @@ if response.status_code == 200:
|
|
|
2815
2825
|
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2816
2826
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2817
2827
|
}
|
|
2828
|
+
if ip_prefix_column:
|
|
2829
|
+
meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
|
|
2818
2830
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2819
2831
|
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2820
2832
|
if SEARCH_KEY_UNNEST in df.columns:
|
upgini/metadata.py
CHANGED
|
@@ -44,6 +44,7 @@ class FileColumnMeaningType(Enum):
|
|
|
44
44
|
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
45
|
UNNEST_KEY = "UNNEST_KEY"
|
|
46
46
|
IP_BINARY = "IP_BINARY"
|
|
47
|
+
IP_PREFIX = "IP_PREFIX"
|
|
47
48
|
IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
|
|
48
49
|
IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
|
|
49
50
|
|
upgini/utils/ip_utils.py
CHANGED
|
@@ -33,9 +33,12 @@ class IpSearchKeyConverter:
|
|
|
33
33
|
else:
|
|
34
34
|
self.logger = logging.getLogger()
|
|
35
35
|
self.logger.setLevel("FATAL")
|
|
36
|
+
self.ip_prefix_column = None
|
|
36
37
|
|
|
37
38
|
@staticmethod
|
|
38
39
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
40
|
+
if ip is None:
|
|
41
|
+
return None
|
|
39
42
|
try:
|
|
40
43
|
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
41
44
|
return int(ip)
|
|
@@ -44,6 +47,8 @@ class IpSearchKeyConverter:
|
|
|
44
47
|
|
|
45
48
|
@staticmethod
|
|
46
49
|
def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
|
|
50
|
+
if ip is None:
|
|
51
|
+
return None
|
|
47
52
|
try:
|
|
48
53
|
if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
|
|
49
54
|
return ip.ipv4_mapped.packed
|
|
@@ -52,6 +57,20 @@ class IpSearchKeyConverter:
|
|
|
52
57
|
except Exception:
|
|
53
58
|
pass
|
|
54
59
|
|
|
60
|
+
@staticmethod
|
|
61
|
+
def _ip_to_prefix(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
62
|
+
if ip is None:
|
|
63
|
+
return None
|
|
64
|
+
try:
|
|
65
|
+
if isinstance(ip, IPv6Address):
|
|
66
|
+
if ip.ipv4_mapped is not None:
|
|
67
|
+
return ".".join(ip.ipv4_mapped.exploded.split(".")[:2])
|
|
68
|
+
return ":".join(ip.exploded.split(":")[:2]) # TODO use 3 in future
|
|
69
|
+
else:
|
|
70
|
+
return ".".join(ip.exploded.split(".")[:2])
|
|
71
|
+
except Exception:
|
|
72
|
+
pass
|
|
73
|
+
|
|
55
74
|
@staticmethod
|
|
56
75
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
57
76
|
try:
|
|
@@ -102,24 +121,26 @@ class IpSearchKeyConverter:
|
|
|
102
121
|
# self.search_keys[ipv4] = SearchKey.IP
|
|
103
122
|
# self.columns_renaming[ipv4] = original_ip
|
|
104
123
|
|
|
105
|
-
ipv6 = self.ip_column + "_v6"
|
|
106
|
-
df[ipv6] = (
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
|
|
124
|
+
# ipv6 = self.ip_column + "_v6"
|
|
125
|
+
# df[ipv6] = (
|
|
126
|
+
# df[self.ip_column]
|
|
127
|
+
# .apply(self._to_ipv6)
|
|
128
|
+
# .apply(self._ip_to_int_str)
|
|
129
|
+
# .astype("string")
|
|
130
|
+
# # .str.replace(".0", "", regex=False)
|
|
131
|
+
# )
|
|
132
|
+
ip_binary = self.ip_column + "_binary"
|
|
133
|
+
df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
|
|
134
|
+
self.ip_prefix_column = self.ip_column + "_prefix"
|
|
135
|
+
df[self.ip_prefix_column] = df[self.ip_column].apply(self._ip_to_prefix)
|
|
115
136
|
|
|
116
137
|
df = df.drop(columns=self.ip_column)
|
|
117
138
|
del self.search_keys[self.ip_column]
|
|
118
139
|
del self.columns_renaming[self.ip_column]
|
|
119
|
-
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
120
|
-
|
|
121
|
-
self.columns_renaming[ipv6] = original_ip
|
|
122
|
-
|
|
140
|
+
# self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
141
|
+
self.search_keys[ip_binary] = SearchKey.IP_BINARY
|
|
142
|
+
# self.columns_renaming[ipv6] = original_ip
|
|
143
|
+
self.columns_renaming[ip_binary] = original_ip
|
|
123
144
|
|
|
124
145
|
return df
|
|
125
146
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=kgsz9u_lLDc3N0akch6v9PpMXz_PW7_aEHXRb1pWgHg,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=80h-1a-UxhuknvuEk1tQk5q5dckqlD_DHzNfufNuaPI,201110
|
|
7
7
|
upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256
|
|
9
|
+
upgini/metadata.py,sha256=zuLdt5XyO_ZH4VsUNshzRHgv6VfYiXy0M8jeohloFBw,12082
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -51,7 +51,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
51
51
|
upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
|
-
upgini/utils/ip_utils.py,sha256=
|
|
54
|
+
upgini/utils/ip_utils.py,sha256=GZqBaV-nky-_Yb9KclmTrYovCG4kawYcbdjEpw1e5Mo,6500
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.51.dist-info/METADATA,sha256=WXti81Fx4H5NawX2D7XvQ5cPEUVi4mlMkykbn94gXKI,49055
|
|
63
|
+
upgini-1.2.51.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.51.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.51.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|