upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (43) hide show
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a1.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,14 @@ import textwrap
4
4
  import urllib.parse
5
5
  import uuid
6
6
  from datetime import datetime, timezone
7
- from io import BytesIO
7
+ from io import StringIO
8
8
  from typing import Callable, List, Optional
9
9
 
10
10
  import pandas as pd
11
11
  from xhtml2pdf import pisa
12
12
 
13
+ from upgini.__about__ import __version__
14
+
13
15
 
14
16
  def ipython_available() -> bool:
15
17
  try:
@@ -71,7 +73,9 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
71
73
  )
72
74
 
73
75
 
74
- def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
76
+ def display_html_dataframe(
77
+ df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
78
+ ):
75
79
  if not ipython_available():
76
80
  print(header)
77
81
  print(internal_df)
@@ -132,7 +136,10 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
132
136
  {table_html}
133
137
  </div>
134
138
  """
135
- display(HTML(result_html))
139
+ if display_handle:
140
+ return display_handle.update(HTML(result_html))
141
+ else:
142
+ return display(HTML(result_html), display_id=display_id)
136
143
 
137
144
 
138
145
  def make_html_report(
@@ -143,7 +150,7 @@ def make_html_report(
143
150
  search_id: str,
144
151
  email: Optional[str] = None,
145
152
  search_keys: Optional[List[str]] = None,
146
- ):
153
+ ) -> str:
147
154
  # relevant_features_df = relevant_features_df.copy()
148
155
  # relevant_features_df["Feature name"] = relevant_features_df["Feature name"].apply(
149
156
  # lambda x: "*" + x if x.contains("_autofe_") else x
@@ -154,9 +161,18 @@ def make_html_report(
154
161
  """<button type="button">Request a quote</button></a>"""
155
162
  )
156
163
  relevant_datasources_df.rename(columns={"action": "&nbsp;"}, inplace=True)
164
+
165
+ try:
166
+ from importlib.resources import files
167
+ font_path = files('upgini.utils').joinpath('Roboto-Regular.ttf')
168
+ except Exception:
169
+ from pkg_resources import resource_filename
170
+ font_path = resource_filename('upgini.utils', 'Roboto-Regular.ttf')
171
+
157
172
  return f"""<html>
158
173
  <head>
159
174
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
175
+ <meta charset="UTF-8">
160
176
  <style>
161
177
  @page {{
162
178
  size: a4 portrait;
@@ -166,23 +182,25 @@ def make_html_report(
166
182
  /*-pdf-frame-border: 1;*/
167
183
  }}
168
184
  @frame content_frame {{
169
- left: 10pt; width: 574pt; top: 50pt; height: 752pt;
185
+ left: 10pt; width: 574pt; top: 50pt; height: 742pt;
170
186
  /*-pdf-frame-border: 1;*/
171
187
  }}
172
188
  @frame footer_frame {{
173
189
  -pdf-frame-content: footer_content;
174
- left: 10pt; width: 574pt; top: 802pt; height: 30pt;
190
+ left: 10pt; width: 574pt; top: 802pt; height: 40pt;
175
191
  /*-pdf-frame-border: 1;*/
176
192
  }}
177
193
  }}
178
194
 
179
195
  @font-face {{
180
- font-family: "Alice-Regular";
181
- src: url("/fonts/Alice-Regular.ttf") format("truetype");
196
+ font-family: "Roboto";
197
+ src: url("{font_path}") format("truetype");
182
198
  }}
183
199
 
184
200
  body {{
185
- font-family: "Alice-Regular", Arial, sans-serif;
201
+ font-family: "Roboto", sans-serif;
202
+ font-weight: 400;
203
+ font-style: normal;
186
204
  }}
187
205
 
188
206
  #header_content {{
@@ -234,7 +252,8 @@ def make_html_report(
234
252
  <div id="header_content">UPGINI</div>
235
253
  <div id="footer_content">
236
254
  © Upgini</br>
237
- sales@upgini.com
255
+ sales@upgini.com</br>
256
+ Launched by version {__version__}
238
257
  </div>
239
258
 
240
259
  <h1>Data search report</h1>
@@ -257,7 +276,7 @@ def make_html_report(
257
276
  }
258
277
  <h3>Relevant data sources</h3>
259
278
  {make_table(relevant_datasources_df)}
260
- <h3>All relevant features. Listing</h3>
279
+ <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
261
280
  {make_table(relevant_features_df, wrap_long_string=25)}
262
281
  {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
263
282
  if autofe_descriptions_df is not None
@@ -277,6 +296,8 @@ def prepare_and_show_report(
277
296
  search_id: str,
278
297
  email: Optional[str],
279
298
  search_keys: Optional[List[str]] = None,
299
+ display_id: Optional[str] = None,
300
+ display_handle=None,
280
301
  ):
281
302
  if not ipython_available():
282
303
  return
@@ -286,22 +307,32 @@ def prepare_and_show_report(
286
307
  )
287
308
 
288
309
  if len(relevant_features_df) > 0:
289
- show_button_download_pdf(report)
310
+ return show_button_download_pdf(report, display_id=display_id, display_handle=display_handle)
290
311
 
291
312
 
292
- def show_button_download_pdf(source: str, title="\U0001F4CA Download PDF report"):
313
+ def show_button_download_pdf(
314
+ source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
315
+ ):
293
316
  from IPython.display import HTML, display
294
317
 
295
318
  file_name = f"upgini-report-{uuid.uuid4()}.pdf"
319
+
320
+ # from weasyprint import HTML
321
+
322
+ # html = HTML(string=source)
323
+ # html.write_pdf(file_name)
296
324
  with open(file_name, "wb") as output:
297
- pisa.CreatePDF(src=BytesIO(source.encode("UTF-8")), dest=output)
325
+ pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
298
326
 
299
327
  with open(file_name, "rb") as f:
300
328
  b64 = base64.b64encode(f.read())
301
329
  payload = b64.decode()
302
330
  html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
303
331
  <button>{title}</button></a>"""
304
- display(HTML(html))
332
+ if display_handle is not None:
333
+ display_handle.update(HTML(html))
334
+ else:
335
+ return display(HTML(html), display_id=display_id)
305
336
 
306
337
 
307
338
  def show_request_quote_button():
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.api.types import is_object_dtype, is_string_dtype
8
8
 
9
9
  from upgini.metadata import SearchKey
10
- from upgini.resource_bundle import bundle
10
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
11
11
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
12
12
 
13
13
  EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
@@ -28,27 +28,54 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
28
28
  return is_email_count / all_count > 0.1
29
29
 
30
30
 
31
+ class EmailDomainGenerator:
32
+ DOMAIN_SUFFIX = "_domain"
33
+
34
+ def __init__(self, email_columns: List[str]):
35
+ self.email_columns = email_columns
36
+ self.generated_features = []
37
+
38
+ def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
+ for email_col in self.email_columns:
40
+ domain_feature = email_col + self.DOMAIN_SUFFIX
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
44
+ return df
45
+
46
+ @staticmethod
47
+ def _email_to_domain(email: str) -> Optional[str]:
48
+ if email is not None and isinstance(email, str) and "@" in email:
49
+ name_and_domain = email.split("@")
50
+ if len(name_and_domain) == 2 and len(name_and_domain[1]) > 0:
51
+ return name_and_domain[1]
52
+
53
+
31
54
  class EmailSearchKeyConverter:
32
- HEM_COLUMN_NAME = "hashed_email"
33
- DOMAIN_COLUMN_NAME = "email_domain"
34
- EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
55
+ HEM_SUFFIX = "_hem"
56
+ ONE_DOMAIN_SUFFIX = "_one_domain"
35
57
 
36
58
  def __init__(
37
59
  self,
38
60
  email_column: str,
39
61
  hem_column: Optional[str],
40
62
  search_keys: Dict[str, SearchKey],
63
+ columns_renaming: Dict[str, str],
64
+ unnest_search_keys: Optional[List[str]] = None,
65
+ bundle: Optional[ResourceBundle] = None,
41
66
  logger: Optional[logging.Logger] = None,
42
67
  ):
43
68
  self.email_column = email_column
44
69
  self.hem_column = hem_column
45
70
  self.search_keys = search_keys
71
+ self.columns_renaming = columns_renaming
72
+ self.unnest_search_keys = unnest_search_keys
73
+ self.bundle = bundle or get_custom_bundle()
46
74
  if logger is not None:
47
75
  self.logger = logger
48
76
  else:
49
77
  self.logger = logging.getLogger()
50
78
  self.logger.setLevel("FATAL")
51
- self.generated_features: List[str] = []
52
79
  self.email_converted_to_hem = False
53
80
 
54
81
  @staticmethod
@@ -59,7 +86,7 @@ class EmailSearchKeyConverter:
59
86
  if not EMAIL_REGEX.fullmatch(email):
60
87
  return None
61
88
 
62
- return sha256(email.lower().encode("utf-8")).hexdigest()
89
+ return sha256(email.lower().encode("utf-8")).hexdigest().lower()
63
90
 
64
91
  @staticmethod
65
92
  def _email_to_one_domain(email: str) -> Optional[str]:
@@ -70,25 +97,36 @@ class EmailSearchKeyConverter:
70
97
 
71
98
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
72
99
  df = df.copy()
100
+ original_email_column = self.columns_renaming[self.email_column]
73
101
  if self.hem_column is None:
74
- df[self.HEM_COLUMN_NAME] = df[self.email_column].apply(self._email_to_hem)
75
- if df[self.HEM_COLUMN_NAME].isna().all():
76
- msg = bundle.get("all_emails_invalid").format(self.email_column)
102
+ hem_name = self.email_column + self.HEM_SUFFIX
103
+ df[hem_name] = df[self.email_column].apply(self._email_to_hem)
104
+ if df[hem_name].isna().all():
105
+ msg = self.bundle.get("all_emails_invalid").format(self.email_column)
77
106
  print(msg)
78
107
  self.logger.warning(msg)
79
- df = df.drop(columns=self.HEM_COLUMN_NAME)
108
+ df = df.drop(columns=hem_name)
80
109
  del self.search_keys[self.email_column]
81
110
  return df
82
- self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
111
+ self.search_keys[hem_name] = SearchKey.HEM
112
+ if self.email_column in self.unnest_search_keys:
113
+ self.unnest_search_keys.append(hem_name)
114
+ self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
83
115
  self.email_converted_to_hem = True
116
+ else:
117
+ df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
84
118
 
85
119
  del self.search_keys[self.email_column]
120
+ if self.email_column in self.unnest_search_keys:
121
+ self.unnest_search_keys.remove(self.email_column)
86
122
 
87
- df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
88
-
89
- self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
123
+ one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
+ df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
+ self.columns_renaming[one_domain_name] = original_email_column
126
+ self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
90
127
 
91
- df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
92
- self.generated_features.append(self.DOMAIN_COLUMN_NAME)
128
+ if self.email_converted_to_hem:
129
+ df = df.drop(columns=self.email_column)
130
+ del self.columns_renaming[self.email_column]
93
131
 
94
132
  return df
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
+ @dataclass
16
+ class FeatureInfo:
17
+ name: str
18
+ internal_name: str
19
+ rounded_shap: float
20
+ hitrate: float
21
+ value_preview: str
22
+ provider: str
23
+ internal_provider: str
24
+ source: str
25
+ internal_source: str
26
+ update_frequency: str
27
+ commercial_schema: str
28
+ doc_link: str
29
+ data_provider_link: str
30
+ data_source_link: str
31
+
32
+ @staticmethod
33
+ def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
34
+ return FeatureInfo(
35
+ name=_get_name(feature_meta),
36
+ internal_name=_get_internal_name(feature_meta),
37
+ rounded_shap=_round_shap_value(feature_meta.shap_value),
38
+ hitrate=feature_meta.hit_rate,
39
+ value_preview=_get_feature_sample(feature_meta, data),
40
+ provider=_get_provider(feature_meta, is_client_feature),
41
+ internal_provider=_get_internal_provider(feature_meta, is_client_feature),
42
+ source=_get_source(feature_meta, is_client_feature),
43
+ internal_source=_get_internal_source(feature_meta, is_client_feature),
44
+ update_frequency=feature_meta.update_frequency,
45
+ commercial_schema=feature_meta.commercial_schema,
46
+ doc_link=feature_meta.doc_link,
47
+ data_provider_link=feature_meta.data_provider_link,
48
+ data_source_link=feature_meta.data_source_link,
49
+ )
50
+
51
+ def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
52
+ return {
53
+ bundle.get("features_info_name"): self.name,
54
+ bundle.get("features_info_shap"): self.rounded_shap,
55
+ bundle.get("features_info_hitrate"): self.hitrate,
56
+ bundle.get("features_info_value_preview"): self.value_preview,
57
+ bundle.get("features_info_provider"): self.provider,
58
+ bundle.get("features_info_source"): self.source,
59
+ bundle.get("features_info_update_frequency"): self.update_frequency,
60
+ }
61
+
62
+ def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
63
+ return {
64
+ bundle.get("features_info_name"): self.internal_name,
65
+ bundle.get("features_info_shap"): self.rounded_shap,
66
+ bundle.get("features_info_hitrate"): self.hitrate,
67
+ bundle.get("features_info_value_preview"): self.value_preview,
68
+ bundle.get("features_info_provider"): self.internal_provider,
69
+ bundle.get("features_info_source"): self.internal_source,
70
+ bundle.get("features_info_update_frequency"): self.update_frequency,
71
+ }
72
+
73
+ def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
74
+ return {
75
+ bundle.get("features_info_name"): self.internal_name,
76
+ "feature_link": self.doc_link,
77
+ bundle.get("features_info_shap"): self.rounded_shap,
78
+ bundle.get("features_info_hitrate"): self.hitrate,
79
+ bundle.get("features_info_value_preview"): self.value_preview,
80
+ bundle.get("features_info_provider"): self.internal_provider,
81
+ "provider_link": self.data_provider_link,
82
+ bundle.get("features_info_source"): self.internal_source,
83
+ "source_link": self.data_source_link,
84
+ bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
85
+ bundle.get("features_info_update_frequency"): self.update_frequency,
86
+ }
87
+
88
+
89
+ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
+ if feature_meta.name in data.columns:
91
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
+ if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
+ feature_sample = [round(f, 4) for f in feature_sample]
94
+ feature_sample = [str(f) for f in feature_sample]
95
+ feature_sample = ", ".join(feature_sample)
96
+ if len(feature_sample) > 30:
97
+ feature_sample = feature_sample[:30] + "..."
98
+ else:
99
+ feature_sample = ""
100
+ return feature_sample
101
+
102
+
103
+ def _get_name(feature_meta: FeaturesMetadataV2) -> str:
104
+ if feature_meta.doc_link:
105
+ return _to_anchor(feature_meta.doc_link, feature_meta.name)
106
+ else:
107
+ return feature_meta.name
108
+
109
+
110
+ def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
111
+ return feature_meta.name
112
+
113
+
114
+ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
115
+ providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
116
+ provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
117
+ if providers:
118
+ provider = _make_links(providers, provider_links)
119
+ else:
120
+ provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
121
+ return provider
122
+
123
+
124
+ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
125
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
126
+
127
+
128
+ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
129
+ sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
130
+ source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
131
+ if sources:
132
+ source = _make_links(sources, source_links)
133
+ else:
134
+ source = _get_internal_source(feature_meta, is_client_feature)
135
+ return source
136
+
137
+
138
+ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
139
+ return feature_meta.data_source or (
140
+ LLM_SOURCE
141
+ if not feature_meta.name.endswith("_country")
142
+ and not feature_meta.name.endswith("_postal_code")
143
+ and not is_client_feature
144
+ else ""
145
+ )
146
+
147
+
148
+ def _list_or_single(lst: List[str], single: str):
149
+ return lst or ([single] if single else [])
150
+
151
+
152
+ def _to_anchor(link: str, value: str) -> str:
153
+ if not value:
154
+ return ""
155
+ elif not link:
156
+ return value
157
+ elif value == LLM_SOURCE:
158
+ return value
159
+ else:
160
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
+ def _make_links(names: List[str], links: List[str]):
164
+ all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
165
+ return ",".join(all_links)
166
+
167
+
168
+ def _round_shap_value(shap: float) -> float:
169
+ if shap > 0.0 and shap < 0.0001:
170
+ return 0.0001
171
+ else:
172
+ return round(shap, 4)
@@ -1,12 +1,12 @@
1
1
  import logging
2
2
  from logging import Logger
3
- from typing import List, Optional
3
+ from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
8
9
  from upgini.resource_bundle import bundle
9
- from upgini.utils.warning_counter import WarningCounter
10
10
 
11
11
 
12
12
  class FeaturesValidator:
@@ -21,12 +21,13 @@ class FeaturesValidator:
21
21
  self,
22
22
  df: pd.DataFrame,
23
23
  features: List[str],
24
- features_for_generate: Optional[List[str]],
25
- warning_counter: WarningCounter,
26
- ) -> List[str]:
24
+ features_for_generate: Optional[List[str]] = None,
25
+ columns_renaming: Optional[Dict[str, str]] = None,
26
+ ) -> Tuple[List[str], List[str]]:
27
27
  # one_hot_encoded_features = []
28
28
  empty_or_constant_features = []
29
29
  high_cardinality_features = []
30
+ warnings = []
30
31
 
31
32
  for f in features:
32
33
  column = df[f]
@@ -51,26 +52,28 @@ class FeaturesValidator:
51
52
 
52
53
  # if one_hot_encoded_features:
53
54
  # msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
54
- # print(msg)
55
- # self.logger.warning(msg)
56
- # warning_counter.increment()
55
+ # warnings.append(msg)
56
+
57
+ columns_renaming = columns_renaming or {}
57
58
 
58
59
  if empty_or_constant_features:
59
- msg = bundle.get("empty_or_contant_features").format(empty_or_constant_features)
60
- print(msg)
61
- self.logger.warning(msg)
62
- warning_counter.increment()
60
+ msg = bundle.get("empty_or_contant_features").format(
61
+ [columns_renaming.get(f, f) for f in empty_or_constant_features]
62
+ )
63
+ warnings.append(msg)
63
64
 
64
65
  high_cardinality_features = self.find_high_cardinality(df[features])
65
66
  if features_for_generate:
66
- high_cardinality_features = [f for f in high_cardinality_features if f not in features_for_generate]
67
+ high_cardinality_features = [
68
+ f for f in high_cardinality_features if columns_renaming.get(f, f) not in features_for_generate
69
+ ]
67
70
  if high_cardinality_features:
68
- msg = bundle.get("high_cardinality_features").format(high_cardinality_features)
69
- print(msg)
70
- self.logger.warning(msg)
71
- warning_counter.increment()
71
+ msg = bundle.get("high_cardinality_features").format(
72
+ [columns_renaming.get(f, f) for f in high_cardinality_features]
73
+ )
74
+ warnings.append(msg)
72
75
 
73
- return empty_or_constant_features + high_cardinality_features
76
+ return (empty_or_constant_features + high_cardinality_features, warnings)
74
77
 
75
78
  @staticmethod
76
79
  def find_high_cardinality(df: pd.DataFrame) -> List[str]:
@@ -81,10 +84,21 @@ class FeaturesValidator:
81
84
  return [
82
85
  i
83
86
  for i in df
84
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
85
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
86
89
  ]
87
90
 
91
+ @staticmethod
92
+ def __is_integer(series: pd.Series) -> bool:
93
+ return (
94
+ is_integer_dtype(series)
95
+ or series.dropna()
96
+ .apply(
97
+ lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
+ )
99
+ .all()
100
+ )
101
+
88
102
  @staticmethod
89
103
  def find_constant_features(df: pd.DataFrame) -> List[str]:
90
- return [i for i in df if df[i].nunique() == 1]
104
+ return [i for i in df if df[i].nunique() <= 1]
upgini/utils/ip_utils.py CHANGED
@@ -1,15 +1,114 @@
1
1
  import logging
2
- from typing import Dict, List, Optional
2
+ from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
3
+ from typing import Dict, List, Optional, Union
3
4
 
4
5
  import pandas as pd
5
6
  from requests import get
6
7
 
8
+ from upgini.errors import ValidationError
7
9
  from upgini.metadata import SearchKey
10
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
8
11
 
9
12
  # from upgini.resource_bundle import bundle
10
13
  # from upgini.utils.track_info import get_track_metrics
11
14
 
12
15
 
16
+ class IpSearchKeyConverter:
17
+ def __init__(
18
+ self,
19
+ ip_column: str,
20
+ search_keys: Dict[str, SearchKey],
21
+ columns_renaming: Dict[str, str],
22
+ unnest_search_keys: Optional[List[str]] = None,
23
+ bundle: Optional[ResourceBundle] = None,
24
+ logger: Optional[logging.Logger] = None,
25
+ ):
26
+ self.ip_column = ip_column
27
+ self.search_keys = search_keys
28
+ self.columns_renaming = columns_renaming
29
+ self.unnest_search_keys = unnest_search_keys
30
+ self.bundle = bundle or get_custom_bundle()
31
+ if logger is not None:
32
+ self.logger = logger
33
+ else:
34
+ self.logger = logging.getLogger()
35
+ self.logger.setLevel("FATAL")
36
+
37
+ @staticmethod
38
+ def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
39
+ try:
40
+ if isinstance(ip, (IPv4Address, IPv6Address)):
41
+ return int(ip)
42
+ except Exception:
43
+ pass
44
+
45
+ @staticmethod
46
+ def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
47
+ try:
48
+ if isinstance(ip, (IPv4Address, IPv6Address)):
49
+ return str(int(ip))
50
+ except Exception:
51
+ pass
52
+
53
+ @staticmethod
54
+ def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
55
+ try:
56
+ return ip_address(ip)
57
+ except ValueError:
58
+ pass
59
+
60
+ # @staticmethod
61
+ # def _is_ipv4(ip: Optional[_BaseAddress]):
62
+ # return ip is not None and (
63
+ # isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
64
+ # )
65
+
66
+ # @staticmethod
67
+ # def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
68
+ # if isinstance(ip, IPv4Address):
69
+ # return ip
70
+ # return None
71
+
72
+ @staticmethod
73
+ def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
74
+ if isinstance(ip, IPv6Address):
75
+ return ip
76
+ if isinstance(ip, IPv4Address):
77
+ return IPv6Address("::ffff:" + str(ip))
78
+ return None
79
+
80
+ def convert(self, df: pd.DataFrame) -> pd.DataFrame:
81
+ """Convert ip address to int"""
82
+ self.logger.info("Convert ip address to int")
83
+ original_ip = self.columns_renaming[self.ip_column]
84
+
85
+ df[self.ip_column] = df[self.ip_column].apply(self._safe_ip_parse)
86
+ if df[self.ip_column].isnull().all():
87
+ raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
88
+
89
+ # legacy support
90
+ # ipv4 = self.ip_column + "_v4"
91
+ # df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
92
+ # self.search_keys[ipv4] = SearchKey.IP
93
+ # self.columns_renaming[ipv4] = original_ip
94
+
95
+ ipv6 = self.ip_column + "_v6"
96
+ df[ipv6] = (
97
+ df[self.ip_column]
98
+ .apply(self._to_ipv6)
99
+ .apply(self._ip_to_int_str)
100
+ .astype("string")
101
+ # .str.replace(".0", "", regex=False)
102
+ )
103
+ df = df.drop(columns=self.ip_column)
104
+ del self.search_keys[self.ip_column]
105
+ del self.columns_renaming[self.ip_column]
106
+ self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
107
+ self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
108
+
109
+ return df
110
+
111
+
13
112
  class IpToCountrySearchKeyConverter:
14
113
  url = "http://ip-api.com/json/{}"
15
114