upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/utils/display_utils.py
CHANGED
|
@@ -4,12 +4,14 @@ import textwrap
|
|
|
4
4
|
import urllib.parse
|
|
5
5
|
import uuid
|
|
6
6
|
from datetime import datetime, timezone
|
|
7
|
-
from io import
|
|
7
|
+
from io import StringIO
|
|
8
8
|
from typing import Callable, List, Optional
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
12
|
|
|
13
|
+
from upgini.__about__ import __version__
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
def ipython_available() -> bool:
|
|
15
17
|
try:
|
|
@@ -71,7 +73,9 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
|
|
|
71
73
|
)
|
|
72
74
|
|
|
73
75
|
|
|
74
|
-
def display_html_dataframe(
|
|
76
|
+
def display_html_dataframe(
|
|
77
|
+
df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
|
|
78
|
+
):
|
|
75
79
|
if not ipython_available():
|
|
76
80
|
print(header)
|
|
77
81
|
print(internal_df)
|
|
@@ -132,7 +136,10 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
|
|
|
132
136
|
{table_html}
|
|
133
137
|
</div>
|
|
134
138
|
"""
|
|
135
|
-
|
|
139
|
+
if display_handle:
|
|
140
|
+
return display_handle.update(HTML(result_html))
|
|
141
|
+
else:
|
|
142
|
+
return display(HTML(result_html), display_id=display_id)
|
|
136
143
|
|
|
137
144
|
|
|
138
145
|
def make_html_report(
|
|
@@ -143,7 +150,7 @@ def make_html_report(
|
|
|
143
150
|
search_id: str,
|
|
144
151
|
email: Optional[str] = None,
|
|
145
152
|
search_keys: Optional[List[str]] = None,
|
|
146
|
-
):
|
|
153
|
+
) -> str:
|
|
147
154
|
# relevant_features_df = relevant_features_df.copy()
|
|
148
155
|
# relevant_features_df["Feature name"] = relevant_features_df["Feature name"].apply(
|
|
149
156
|
# lambda x: "*" + x if x.contains("_autofe_") else x
|
|
@@ -154,9 +161,18 @@ def make_html_report(
|
|
|
154
161
|
"""<button type="button">Request a quote</button></a>"""
|
|
155
162
|
)
|
|
156
163
|
relevant_datasources_df.rename(columns={"action": " "}, inplace=True)
|
|
164
|
+
|
|
165
|
+
try:
|
|
166
|
+
from importlib.resources import files
|
|
167
|
+
font_path = files('upgini.utils').joinpath('Roboto-Regular.ttf')
|
|
168
|
+
except Exception:
|
|
169
|
+
from pkg_resources import resource_filename
|
|
170
|
+
font_path = resource_filename('upgini.utils', 'Roboto-Regular.ttf')
|
|
171
|
+
|
|
157
172
|
return f"""<html>
|
|
158
173
|
<head>
|
|
159
174
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
175
|
+
<meta charset="UTF-8">
|
|
160
176
|
<style>
|
|
161
177
|
@page {{
|
|
162
178
|
size: a4 portrait;
|
|
@@ -166,23 +182,25 @@ def make_html_report(
|
|
|
166
182
|
/*-pdf-frame-border: 1;*/
|
|
167
183
|
}}
|
|
168
184
|
@frame content_frame {{
|
|
169
|
-
left: 10pt; width: 574pt; top: 50pt; height:
|
|
185
|
+
left: 10pt; width: 574pt; top: 50pt; height: 742pt;
|
|
170
186
|
/*-pdf-frame-border: 1;*/
|
|
171
187
|
}}
|
|
172
188
|
@frame footer_frame {{
|
|
173
189
|
-pdf-frame-content: footer_content;
|
|
174
|
-
left: 10pt; width: 574pt; top: 802pt; height:
|
|
190
|
+
left: 10pt; width: 574pt; top: 802pt; height: 40pt;
|
|
175
191
|
/*-pdf-frame-border: 1;*/
|
|
176
192
|
}}
|
|
177
193
|
}}
|
|
178
194
|
|
|
179
195
|
@font-face {{
|
|
180
|
-
font-family: "
|
|
181
|
-
src: url("
|
|
196
|
+
font-family: "Roboto";
|
|
197
|
+
src: url("{font_path}") format("truetype");
|
|
182
198
|
}}
|
|
183
199
|
|
|
184
200
|
body {{
|
|
185
|
-
font-family: "
|
|
201
|
+
font-family: "Roboto", sans-serif;
|
|
202
|
+
font-weight: 400;
|
|
203
|
+
font-style: normal;
|
|
186
204
|
}}
|
|
187
205
|
|
|
188
206
|
#header_content {{
|
|
@@ -234,7 +252,8 @@ def make_html_report(
|
|
|
234
252
|
<div id="header_content">UPGINI</div>
|
|
235
253
|
<div id="footer_content">
|
|
236
254
|
© Upgini</br>
|
|
237
|
-
sales@upgini.com
|
|
255
|
+
sales@upgini.com</br>
|
|
256
|
+
Launched by version {__version__}
|
|
238
257
|
</div>
|
|
239
258
|
|
|
240
259
|
<h1>Data search report</h1>
|
|
@@ -257,7 +276,7 @@ def make_html_report(
|
|
|
257
276
|
}
|
|
258
277
|
<h3>Relevant data sources</h3>
|
|
259
278
|
{make_table(relevant_datasources_df)}
|
|
260
|
-
<h3>All relevant features. Listing</h3>
|
|
279
|
+
<h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
|
|
261
280
|
{make_table(relevant_features_df, wrap_long_string=25)}
|
|
262
281
|
{"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
|
|
263
282
|
if autofe_descriptions_df is not None
|
|
@@ -277,6 +296,8 @@ def prepare_and_show_report(
|
|
|
277
296
|
search_id: str,
|
|
278
297
|
email: Optional[str],
|
|
279
298
|
search_keys: Optional[List[str]] = None,
|
|
299
|
+
display_id: Optional[str] = None,
|
|
300
|
+
display_handle=None,
|
|
280
301
|
):
|
|
281
302
|
if not ipython_available():
|
|
282
303
|
return
|
|
@@ -286,22 +307,32 @@ def prepare_and_show_report(
|
|
|
286
307
|
)
|
|
287
308
|
|
|
288
309
|
if len(relevant_features_df) > 0:
|
|
289
|
-
show_button_download_pdf(report)
|
|
310
|
+
return show_button_download_pdf(report, display_id=display_id, display_handle=display_handle)
|
|
290
311
|
|
|
291
312
|
|
|
292
|
-
def show_button_download_pdf(
|
|
313
|
+
def show_button_download_pdf(
|
|
314
|
+
source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
|
|
315
|
+
):
|
|
293
316
|
from IPython.display import HTML, display
|
|
294
317
|
|
|
295
318
|
file_name = f"upgini-report-{uuid.uuid4()}.pdf"
|
|
319
|
+
|
|
320
|
+
# from weasyprint import HTML
|
|
321
|
+
|
|
322
|
+
# html = HTML(string=source)
|
|
323
|
+
# html.write_pdf(file_name)
|
|
296
324
|
with open(file_name, "wb") as output:
|
|
297
|
-
pisa.CreatePDF(src=
|
|
325
|
+
pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
|
|
298
326
|
|
|
299
327
|
with open(file_name, "rb") as f:
|
|
300
328
|
b64 = base64.b64encode(f.read())
|
|
301
329
|
payload = b64.decode()
|
|
302
330
|
html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
|
|
303
331
|
<button>{title}</button></a>"""
|
|
304
|
-
|
|
332
|
+
if display_handle is not None:
|
|
333
|
+
display_handle.update(HTML(html))
|
|
334
|
+
else:
|
|
335
|
+
return display(HTML(html), display_id=display_id)
|
|
305
336
|
|
|
306
337
|
|
|
307
338
|
def show_request_quote_button():
|
upgini/utils/email_utils.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
8
8
|
|
|
9
9
|
from upgini.metadata import SearchKey
|
|
10
|
-
from upgini.resource_bundle import
|
|
10
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
11
11
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
12
12
|
|
|
13
13
|
EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
|
|
@@ -28,27 +28,54 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
28
28
|
return is_email_count / all_count > 0.1
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
class EmailDomainGenerator:
    """Derive a ``<email column>_domain`` feature for each configured email column."""

    DOMAIN_SUFFIX = "_domain"

    def __init__(self, email_columns: List[str]):
        self.email_columns = email_columns
        # Names of the domain columns this instance has added so far.
        self.generated_features = []

    def generate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the missing domain columns to ``df`` in place and return it.

        A domain column that already exists in ``df`` is left untouched and is
        not recorded in ``generated_features``.
        """
        for source_column in self.email_columns:
            target_column = source_column + self.DOMAIN_SUFFIX
            if target_column in df.columns:
                continue
            domains = df[source_column].apply(self._email_to_domain)
            df[target_column] = domains.astype("string")
            self.generated_features.append(target_column)
        return df

    @staticmethod
    def _email_to_domain(email: str) -> Optional[str]:
        """Return the part after the single ``@``, or None when there is no such part."""
        if not isinstance(email, str):
            return None
        pieces = email.split("@")
        # Exactly one "@" and a non-empty right-hand side are required.
        if len(pieces) != 2:
            return None
        domain = pieces[1]
        return domain if len(domain) > 0 else None
|
|
52
|
+
|
|
53
|
+
|
|
31
54
|
class EmailSearchKeyConverter:
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
|
|
55
|
+
HEM_SUFFIX = "_hem"
|
|
56
|
+
ONE_DOMAIN_SUFFIX = "_one_domain"
|
|
35
57
|
|
|
36
58
|
def __init__(
    self,
    email_column: str,
    hem_column: Optional[str],
    search_keys: Dict[str, SearchKey],
    columns_renaming: Dict[str, str],
    unnest_search_keys: Optional[List[str]] = None,
    bundle: Optional[ResourceBundle] = None,
    logger: Optional[logging.Logger] = None,
):
    """Store the conversion settings.

    Args:
        email_column: Name of the column holding emails.
        hem_column: Name of an existing hashed-email column, or None.
        search_keys: Mapping of column name to search key; kept by reference.
        columns_renaming: Mapping of column name to original name; kept by reference.
        unnest_search_keys: Optional list of unnested key columns; kept by reference.
        bundle: Message bundle; ``get_custom_bundle()`` is used when falsy.
        logger: Logger to use; a private silenced logger is used when None.
    """
    self.email_column = email_column
    self.hem_column = hem_column
    self.search_keys = search_keys
    self.columns_renaming = columns_renaming
    self.unnest_search_keys = unnest_search_keys
    self.bundle = bundle or get_custom_bundle()
    if logger is not None:
        self.logger = logger
    else:
        # logging.getLogger() without a name is the ROOT logger; raising its
        # level would mute logging for the whole process. Use a dedicated
        # logger so only this converter stays quiet by default.
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}.silent")
        self.logger.setLevel("FATAL")
    self.email_converted_to_hem = False
|
|
53
80
|
|
|
54
81
|
@staticmethod
|
|
@@ -59,7 +86,7 @@ class EmailSearchKeyConverter:
|
|
|
59
86
|
if not EMAIL_REGEX.fullmatch(email):
|
|
60
87
|
return None
|
|
61
88
|
|
|
62
|
-
return sha256(email.lower().encode("utf-8")).hexdigest()
|
|
89
|
+
return sha256(email.lower().encode("utf-8")).hexdigest().lower()
|
|
63
90
|
|
|
64
91
|
@staticmethod
|
|
65
92
|
def _email_to_one_domain(email: str) -> Optional[str]:
|
|
@@ -70,25 +97,36 @@ class EmailSearchKeyConverter:
|
|
|
70
97
|
|
|
71
98
|
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the email key replaced by derived keys.

    When no HEM column was configured, a ``<email>_hem`` column is computed
    with ``_email_to_hem``. If every value is missing, the ``all_emails_invalid``
    message is printed and logged, the email key is removed from
    ``search_keys`` and the frame is returned without derived columns.
    Otherwise a ``<email>_one_domain`` column is added, and ``search_keys``,
    ``columns_renaming`` and ``unnest_search_keys`` are updated in place.

    Raises:
        KeyError: If the email column is missing from ``columns_renaming``
            or ``search_keys``.
    """
    df = df.copy()
    original_email_column = self.columns_renaming[self.email_column]

    # ``unnest_search_keys`` defaults to None; testing ``in None`` raises
    # TypeError, so treat None as "email column is not unnested".
    unnest_keys = self.unnest_search_keys
    email_is_unnested = unnest_keys is not None and self.email_column in unnest_keys

    if self.hem_column is None:
        hem_name = self.email_column + self.HEM_SUFFIX
        df[hem_name] = df[self.email_column].apply(self._email_to_hem)
        if df[hem_name].isna().all():
            msg = self.bundle.get("all_emails_invalid").format(self.email_column)
            print(msg)
            self.logger.warning(msg)
            df = df.drop(columns=hem_name)
            del self.search_keys[self.email_column]
            return df
        self.search_keys[hem_name] = SearchKey.HEM
        if email_is_unnested:
            unnest_keys.append(hem_name)
        self.columns_renaming[hem_name] = original_email_column  # it could be upgini_email_unnest...
        self.email_converted_to_hem = True
    else:
        df[self.hem_column] = df[self.hem_column].astype("string").str.lower()

    del self.search_keys[self.email_column]
    if email_is_unnested:
        unnest_keys.remove(self.email_column)

    one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
    df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
    self.columns_renaming[one_domain_name] = original_email_column
    self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN

    # The raw email column is only dropped when this call produced the HEM itself.
    if self.email_converted_to_hem:
        df = df.drop(columns=self.email_column)
        del self.columns_renaming[self.email_column]

    return df
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FeatureInfo:
    """Display-ready description of one feature, in linked and plain ("internal") forms."""

    name: str
    internal_name: str
    rounded_shap: float
    hitrate: float
    value_preview: str
    provider: str
    internal_provider: str
    source: str
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
        """Build a ``FeatureInfo`` from feature metadata and the data used for the preview."""
        meta = feature_meta
        return FeatureInfo(
            name=_get_name(meta),
            internal_name=_get_internal_name(meta),
            rounded_shap=_round_shap_value(meta.shap_value),
            hitrate=meta.hit_rate,
            value_preview=_get_feature_sample(meta, data),
            provider=_get_provider(meta, is_client_feature),
            internal_provider=_get_internal_provider(meta, is_client_feature),
            source=_get_source(meta, is_client_feature),
            internal_source=_get_internal_source(meta, is_client_feature),
            update_frequency=meta.update_frequency,
            commercial_schema=meta.commercial_schema,
            doc_link=meta.doc_link,
            data_provider_link=meta.data_provider_link,
            data_source_link=meta.data_source_link,
        )

    def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the row keyed by bundle labels, using the linked name/provider/source."""
        cells = (
            (bundle.get("features_info_name"), self.name),
            (bundle.get("features_info_shap"), self.rounded_shap),
            (bundle.get("features_info_hitrate"), self.hitrate),
            (bundle.get("features_info_value_preview"), self.value_preview),
            (bundle.get("features_info_provider"), self.provider),
            (bundle.get("features_info_source"), self.source),
            (bundle.get("features_info_update_frequency"), self.update_frequency),
        )
        return dict(cells)

    def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the same columns as ``to_row`` but with the internal (plain) values."""
        cells = (
            (bundle.get("features_info_name"), self.internal_name),
            (bundle.get("features_info_shap"), self.rounded_shap),
            (bundle.get("features_info_hitrate"), self.hitrate),
            (bundle.get("features_info_value_preview"), self.value_preview),
            (bundle.get("features_info_provider"), self.internal_provider),
            (bundle.get("features_info_source"), self.internal_source),
            (bundle.get("features_info_update_frequency"), self.update_frequency),
        )
        return dict(cells)

    def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the internal row: plain values plus separate link and schema columns."""
        cells = (
            (bundle.get("features_info_name"), self.internal_name),
            ("feature_link", self.doc_link),
            (bundle.get("features_info_shap"), self.rounded_shap),
            (bundle.get("features_info_hitrate"), self.hitrate),
            (bundle.get("features_info_value_preview"), self.value_preview),
            (bundle.get("features_info_provider"), self.internal_provider),
            ("provider_link", self.data_provider_link),
            (bundle.get("features_info_source"), self.internal_source),
            ("source_link", self.data_source_link),
            # A missing commercial schema is rendered as an empty string.
            (bundle.get("features_info_commercial_schema"), self.commercial_schema or ""),
            (bundle.get("features_info_update_frequency"), self.update_frequency),
        )
        return dict(cells)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
+
if feature_meta.name in data.columns:
|
|
91
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
+
feature_sample = ", ".join(feature_sample)
|
|
96
|
+
if len(feature_sample) > 30:
|
|
97
|
+
feature_sample = feature_sample[:30] + "..."
|
|
98
|
+
else:
|
|
99
|
+
feature_sample = ""
|
|
100
|
+
return feature_sample
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
+
if feature_meta.doc_link:
|
|
105
|
+
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
+
else:
|
|
107
|
+
return feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
+
return feature_meta.name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the provider cell: linked provider names, or a default.

    With no provider in the metadata, client features get an empty string and
    all other features get an anchor to Upgini.
    """
    names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if names:
        return _make_links(names, links)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the source cell: linked source names, else the internal source."""
    names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not names:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(names, links)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
+
return feature_meta.data_source or (
|
|
140
|
+
LLM_SOURCE
|
|
141
|
+
if not feature_meta.name.endswith("_country")
|
|
142
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
+
and not is_client_feature
|
|
144
|
+
else ""
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
|
|
149
|
+
return lst or ([single] if single else [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
+
if not value:
|
|
154
|
+
return ""
|
|
155
|
+
elif not link:
|
|
156
|
+
return value
|
|
157
|
+
elif value == LLM_SOURCE:
|
|
158
|
+
return value
|
|
159
|
+
else:
|
|
160
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
|
|
164
|
+
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
165
|
+
return ",".join(all_links)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from logging import Logger
|
|
3
|
-
from typing import List, Optional
|
|
3
|
+
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
8
9
|
from upgini.resource_bundle import bundle
|
|
9
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class FeaturesValidator:
|
|
@@ -21,12 +21,13 @@ class FeaturesValidator:
|
|
|
21
21
|
self,
|
|
22
22
|
df: pd.DataFrame,
|
|
23
23
|
features: List[str],
|
|
24
|
-
features_for_generate: Optional[List[str]],
|
|
25
|
-
|
|
26
|
-
) -> List[str]:
|
|
24
|
+
features_for_generate: Optional[List[str]] = None,
|
|
25
|
+
columns_renaming: Optional[Dict[str, str]] = None,
|
|
26
|
+
) -> Tuple[List[str], List[str]]:
|
|
27
27
|
# one_hot_encoded_features = []
|
|
28
28
|
empty_or_constant_features = []
|
|
29
29
|
high_cardinality_features = []
|
|
30
|
+
warnings = []
|
|
30
31
|
|
|
31
32
|
for f in features:
|
|
32
33
|
column = df[f]
|
|
@@ -51,26 +52,28 @@ class FeaturesValidator:
|
|
|
51
52
|
|
|
52
53
|
# if one_hot_encoded_features:
|
|
53
54
|
# msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
|
54
|
-
#
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
# warnings.append(msg)
|
|
56
|
+
|
|
57
|
+
columns_renaming = columns_renaming or {}
|
|
57
58
|
|
|
58
59
|
if empty_or_constant_features:
|
|
59
|
-
msg = bundle.get("empty_or_contant_features").format(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
60
|
+
msg = bundle.get("empty_or_contant_features").format(
|
|
61
|
+
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
62
|
+
)
|
|
63
|
+
warnings.append(msg)
|
|
63
64
|
|
|
64
65
|
high_cardinality_features = self.find_high_cardinality(df[features])
|
|
65
66
|
if features_for_generate:
|
|
66
|
-
high_cardinality_features = [
|
|
67
|
+
high_cardinality_features = [
|
|
68
|
+
f for f in high_cardinality_features if columns_renaming.get(f, f) not in features_for_generate
|
|
69
|
+
]
|
|
67
70
|
if high_cardinality_features:
|
|
68
|
-
msg = bundle.get("high_cardinality_features").format(
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
71
|
+
msg = bundle.get("high_cardinality_features").format(
|
|
72
|
+
[columns_renaming.get(f, f) for f in high_cardinality_features]
|
|
73
|
+
)
|
|
74
|
+
warnings.append(msg)
|
|
72
75
|
|
|
73
|
-
return empty_or_constant_features + high_cardinality_features
|
|
76
|
+
return (empty_or_constant_features + high_cardinality_features, warnings)
|
|
74
77
|
|
|
75
78
|
@staticmethod
|
|
76
79
|
def find_high_cardinality(df: pd.DataFrame) -> List[str]:
|
|
@@ -81,10 +84,21 @@ class FeaturesValidator:
|
|
|
81
84
|
return [
|
|
82
85
|
i
|
|
83
86
|
for i in df
|
|
84
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
85
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
86
89
|
]
|
|
87
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
+
return (
|
|
94
|
+
is_integer_dtype(series)
|
|
95
|
+
or series.dropna()
|
|
96
|
+
.apply(
|
|
97
|
+
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
+
)
|
|
99
|
+
.all()
|
|
100
|
+
)
|
|
101
|
+
|
|
88
102
|
@staticmethod
|
|
89
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
90
|
-
return [i for i in df if df[i].nunique()
|
|
104
|
+
return [i for i in df if df[i].nunique() <= 1]
|
upgini/utils/ip_utils.py
CHANGED
|
@@ -1,15 +1,114 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
|
|
3
|
+
from typing import Dict, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
from requests import get
|
|
6
7
|
|
|
8
|
+
from upgini.errors import ValidationError
|
|
7
9
|
from upgini.metadata import SearchKey
|
|
10
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
8
11
|
|
|
9
12
|
# from upgini.resource_bundle import bundle
|
|
10
13
|
# from upgini.utils.track_info import get_track_metrics
|
|
11
14
|
|
|
12
15
|
|
|
16
|
+
class IpSearchKeyConverter:
    """Convert an IP search-key column into an ``<ip column>_v6`` integer-string column."""

    def __init__(
        self,
        ip_column: str,
        search_keys: Dict[str, SearchKey],
        columns_renaming: Dict[str, str],
        unnest_search_keys: Optional[List[str]] = None,
        bundle: Optional[ResourceBundle] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Store the conversion settings.

        Args:
            ip_column: Name of the column holding IP addresses.
            search_keys: Mapping of column name to search key; updated by ``convert``.
            columns_renaming: Mapping of column name to original name; updated by ``convert``.
            unnest_search_keys: Optional list of unnested key columns (stored only).
            bundle: Message bundle; ``get_custom_bundle()`` is used when falsy.
            logger: Logger to use; a private silenced logger is used when None.
        """
        self.ip_column = ip_column
        self.search_keys = search_keys
        self.columns_renaming = columns_renaming
        self.unnest_search_keys = unnest_search_keys
        self.bundle = bundle or get_custom_bundle()
        if logger is not None:
            self.logger = logger
        else:
            # logging.getLogger() without a name is the ROOT logger; raising
            # its level would mute logging for the whole process. Use a
            # dedicated logger so only this converter stays quiet by default.
            self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}.silent")
            self.logger.setLevel("FATAL")

    @staticmethod
    def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
        """Return the integer form of a parsed address, or None for anything else."""
        if isinstance(ip, (IPv4Address, IPv6Address)):
            return int(ip)
        return None

    @staticmethod
    def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
        """Return the integer form of a parsed address as a decimal string, or None."""
        if isinstance(ip, (IPv4Address, IPv6Address)):
            return str(int(ip))
        return None

    @staticmethod
    def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
        """Parse ``ip`` with ``ipaddress.ip_address``; return None when it is not a valid address."""
        try:
            return ip_address(ip)
        except ValueError:
            return None

    # @staticmethod
    # def _is_ipv4(ip: Optional[_BaseAddress]):
    #     return ip is not None and (
    #         isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
    #     )

    # @staticmethod
    # def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
    #     if isinstance(ip, IPv4Address):
    #         return ip
    #     return None

    @staticmethod
    def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
        """Return ``ip`` as IPv6, mapping IPv4 addresses to ``::ffff:a.b.c.d``."""
        if isinstance(ip, IPv6Address):
            return ip
        if isinstance(ip, IPv4Address):
            return IPv6Address("::ffff:" + str(ip))
        return None

    def convert(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert ip address to int

        Returns a new frame in which the IP column is replaced by an
        ``<ip column>_v6`` string column holding the IPv6 integer value.
        ``search_keys`` and ``columns_renaming`` are updated in place; the
        frame passed in is left unmodified.

        Raises:
            ValidationError: If no value in the IP column parses as an address.
            KeyError: If the IP column is missing from ``columns_renaming``
                or ``search_keys``.
        """
        self.logger.info("Convert ip address to int")
        original_ip = self.columns_renaming[self.ip_column]

        # Parse into a separate series so the caller's frame is not
        # overwritten, including when validation fails below.
        parsed = df[self.ip_column].apply(self._safe_ip_parse)
        if parsed.isnull().all():
            raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))

        # legacy support
        # ipv4 = self.ip_column + "_v4"
        # df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
        # self.search_keys[ipv4] = SearchKey.IP
        # self.columns_renaming[ipv4] = original_ip

        ipv6 = self.ip_column + "_v6"
        ipv6_values = (
            parsed.apply(self._to_ipv6)
            .apply(self._ip_to_int_str)
            .astype("string")
            # .str.replace(".0", "", regex=False)
        )
        # drop() returns a new frame, so the new column lands on the copy.
        df = df.drop(columns=self.ip_column)
        df[ipv6] = ipv6_values
        del self.search_keys[self.ip_column]
        del self.columns_renaming[self.ip_column]
        self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
        self.columns_renaming[ipv6] = original_ip  # could be __unnest_ip...

        return df
|
|
110
|
+
|
|
111
|
+
|
|
13
112
|
class IpToCountrySearchKeyConverter:
|
|
14
113
|
url = "http://ip-api.com/json/{}"
|
|
15
114
|
|