upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316a1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.

Potentially problematic release.


This version of upgini might be problematic; see the release advisory for more details.

@@ -6,13 +6,10 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import (
10
- is_numeric_dtype,
11
- is_period_dtype,
12
- )
9
+ from pandas.api.types import is_numeric_dtype
13
10
 
14
11
  from upgini.errors import ValidationError
15
- from upgini.metadata import SearchKey
12
+ from upgini.metadata import EVAL_SET_INDEX, SearchKey
16
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
17
14
  from upgini.utils.warning_counter import WarningCounter
18
15
 
@@ -31,18 +28,22 @@ DATE_FORMATS = [
31
28
  "%Y-%m-%dT%H:%M:%S.%f",
32
29
  ]
33
30
 
34
- DATETIME_PATTERN = r"^[\d\s\.\-:T/]+$"
31
+ DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
35
32
 
36
33
 
37
34
  class DateTimeSearchKeyConverter:
38
35
  DATETIME_COL = "_date_time"
36
+ # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
37
+ MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
39
38
 
40
39
  def __init__(
41
40
  self,
42
41
  date_column: str,
43
42
  date_format: Optional[str] = None,
44
43
  logger: Optional[logging.Logger] = None,
45
- bundle: ResourceBundle = None,
44
+ bundle: Optional[ResourceBundle] = None,
45
+ warnings_counter: Optional[WarningCounter] = None,
46
+ silent_mode=False,
46
47
  ):
47
48
  self.date_column = date_column
48
49
  self.date_format = date_format
@@ -53,6 +54,8 @@ class DateTimeSearchKeyConverter:
53
54
  self.logger.setLevel("FATAL")
54
55
  self.generated_features: List[str] = []
55
56
  self.bundle = bundle or get_custom_bundle()
57
+ self.warnings_counter = warnings_counter or WarningCounter()
58
+ self.silent_mode = silent_mode
56
59
 
57
60
  @staticmethod
58
61
  def _int_to_opt(i: int) -> Optional[int]:
@@ -81,20 +84,20 @@ class DateTimeSearchKeyConverter:
81
84
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
82
85
  elif isinstance(df[self.date_column].values[0], datetime.date):
83
86
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
84
- elif is_period_dtype(df[self.date_column]):
87
+ elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
85
88
  df[self.date_column] = df[self.date_column].dt.to_timestamp()
86
89
  elif is_numeric_dtype(df[self.date_column]):
87
90
  # 315532801 - 2524608001 - seconds
88
91
  # 315532801000 - 2524608001000 - milliseconds
89
92
  # 315532801000000 - 2524608001000000 - microseconds
90
93
  # 315532801000000000 - 2524608001000000000 - nanoseconds
91
- if df[self.date_column].apply(lambda x: 10 ** 16 < x).all():
94
+ if df[self.date_column].apply(lambda x: 10**16 < x).all():
92
95
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
93
- elif df[self.date_column].apply(lambda x: 10 ** 14 < x < 10 ** 16).all():
96
+ elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
94
97
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
95
- elif df[self.date_column].apply(lambda x: 10 ** 11 < x < 10 ** 14).all():
98
+ elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
96
99
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
97
- elif df[self.date_column].apply(lambda x: 0 < x < 10 ** 11).all():
100
+ elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
98
101
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
99
102
  else:
100
103
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
@@ -108,6 +111,9 @@ class DateTimeSearchKeyConverter:
108
111
  # as additional features
109
112
  seconds = "datetime_seconds"
110
113
  df[self.date_column] = df[self.date_column].dt.tz_localize(None)
114
+
115
+ df = self.clean_old_dates(df)
116
+
111
117
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
112
118
 
113
119
  seconds_without_na = df[seconds].dropna()
@@ -152,6 +158,19 @@ class DateTimeSearchKeyConverter:
152
158
  except ValueError:
153
159
  raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
154
160
 
161
+ def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
162
+ condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
163
+ old_subset = df[condition]
164
+ if len(old_subset) > 0:
165
+ df.loc[condition, self.date_column] = None
166
+ self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
167
+ msg = self.bundle.get("dataset_drop_old_dates")
168
+ self.logger.warning(msg)
169
+ if not self.silent_mode:
170
+ print(msg)
171
+ self.warnings_counter.increment()
172
+ return df
173
+
155
174
 
156
175
  def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
157
176
  try:
@@ -188,7 +207,7 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
188
207
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
189
208
  df = df.copy()
190
209
  seconds = "datetime_seconds"
191
- if is_period_dtype(df[date_col]):
210
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
192
211
  df[date_col] = df[date_col].dt.to_timestamp()
193
212
  else:
194
213
  df[date_col] = pd.to_datetime(df[date_col])
@@ -238,23 +257,25 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
238
257
 
239
258
 
240
259
  def validate_dates_distribution(
241
- X: pd.DataFrame,
260
+ df: pd.DataFrame,
242
261
  search_keys: Dict[str, SearchKey],
243
262
  logger: Optional[logging.Logger] = None,
244
263
  bundle: Optional[ResourceBundle] = None,
245
264
  warning_counter: Optional[WarningCounter] = None,
246
265
  ):
247
- maybe_date_col = None
248
- for key, key_type in search_keys.items():
249
- if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
250
- maybe_date_col = key
266
+ maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
267
+
268
+ if EVAL_SET_INDEX in df.columns:
269
+ X = df.query(f"{EVAL_SET_INDEX} == 0")
270
+ else:
271
+ X = df
251
272
 
252
273
  if maybe_date_col is None:
253
274
  for col in X.columns:
254
275
  if col in search_keys:
255
276
  continue
256
277
  try:
257
- if is_period_dtype(X[col]):
278
+ if isinstance(X[col].dtype, pd.PeriodDtype):
258
279
  pass
259
280
  elif pd.__version__ >= "2.0.0":
260
281
  # Format mixed to avoid massive warnings
@@ -269,7 +290,7 @@ def validate_dates_distribution(
269
290
  if maybe_date_col is None:
270
291
  return
271
292
 
272
- if is_period_dtype(X[maybe_date_col]):
293
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
273
294
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
274
295
  elif pd.__version__ >= "2.0.0":
275
296
  dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.api.types import is_object_dtype, is_string_dtype
8
8
 
9
9
  from upgini.metadata import SearchKey
10
- from upgini.resource_bundle import bundle
10
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
11
11
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
12
12
 
13
13
  EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
@@ -28,29 +28,53 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
28
28
  return is_email_count / all_count > 0.1
29
29
 
30
30
 
31
+ class EmailDomainGenerator:
32
+ DOMAIN_SUFFIX = "_domain"
33
+
34
+ def __init__(self, email_columns: List[str]):
35
+ self.email_columns = email_columns
36
+ self.generated_features = []
37
+
38
+ def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
+ for email_col in self.email_columns:
40
+ domain_feature = email_col + self.DOMAIN_SUFFIX
41
+ df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
+ self.generated_features.append(domain_feature)
43
+ return df
44
+
45
+ @staticmethod
46
+ def _email_to_domain(email: str) -> Optional[str]:
47
+ if email is not None and isinstance(email, str) and "@" in email:
48
+ name_and_domain = email.split("@")
49
+ if len(name_and_domain) == 2 and len(name_and_domain[1]) > 0:
50
+ return name_and_domain[1]
51
+
52
+
31
53
  class EmailSearchKeyConverter:
32
- HEM_COLUMN_NAME = "hashed_email"
33
- DOMAIN_COLUMN_NAME = "email_domain"
34
- EMAIL_ONE_DOMAIN_COLUMN_NAME = "email_one_domain"
54
+ HEM_SUFFIX = "_hem"
55
+ ONE_DOMAIN_SUFFIX = "_one_domain"
35
56
 
36
57
  def __init__(
37
58
  self,
38
59
  email_column: str,
39
60
  hem_column: Optional[str],
40
61
  search_keys: Dict[str, SearchKey],
62
+ columns_renaming: Dict[str, str],
41
63
  unnest_search_keys: Optional[List[str]] = None,
64
+ bundle: Optional[ResourceBundle] = None,
42
65
  logger: Optional[logging.Logger] = None,
43
66
  ):
44
67
  self.email_column = email_column
45
68
  self.hem_column = hem_column
46
69
  self.search_keys = search_keys
70
+ self.columns_renaming = columns_renaming
47
71
  self.unnest_search_keys = unnest_search_keys
72
+ self.bundle = bundle or get_custom_bundle()
48
73
  if logger is not None:
49
74
  self.logger = logger
50
75
  else:
51
76
  self.logger = logging.getLogger()
52
77
  self.logger.setLevel("FATAL")
53
- self.generated_features: List[str] = []
54
78
  self.email_converted_to_hem = False
55
79
 
56
80
  @staticmethod
@@ -61,7 +85,7 @@ class EmailSearchKeyConverter:
61
85
  if not EMAIL_REGEX.fullmatch(email):
62
86
  return None
63
87
 
64
- return sha256(email.lower().encode("utf-8")).hexdigest()
88
+ return sha256(email.lower().encode("utf-8")).hexdigest().lower()
65
89
 
66
90
  @staticmethod
67
91
  def _email_to_one_domain(email: str) -> Optional[str]:
@@ -72,28 +96,36 @@ class EmailSearchKeyConverter:
72
96
 
73
97
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
74
98
  df = df.copy()
99
+ original_email_column = self.columns_renaming[self.email_column]
75
100
  if self.hem_column is None:
76
- df[self.HEM_COLUMN_NAME] = df[self.email_column].apply(self._email_to_hem)
77
- if df[self.HEM_COLUMN_NAME].isna().all():
78
- msg = bundle.get("all_emails_invalid").format(self.email_column)
101
+ hem_name = self.email_column + self.HEM_SUFFIX
102
+ df[hem_name] = df[self.email_column].apply(self._email_to_hem)
103
+ if df[hem_name].isna().all():
104
+ msg = self.bundle.get("all_emails_invalid").format(self.email_column)
79
105
  print(msg)
80
106
  self.logger.warning(msg)
81
- df = df.drop(columns=self.HEM_COLUMN_NAME)
107
+ df = df.drop(columns=hem_name)
82
108
  del self.search_keys[self.email_column]
83
109
  return df
84
- self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
- self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
110
+ self.search_keys[hem_name] = SearchKey.HEM
111
+ if self.email_column in self.unnest_search_keys:
112
+ self.unnest_search_keys.append(hem_name)
113
+ self.columns_renaming[hem_name] = original_email_column # it could be upgini_email_unnest...
86
114
  self.email_converted_to_hem = True
115
+ else:
116
+ df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
87
117
 
88
118
  del self.search_keys[self.email_column]
89
119
  if self.email_column in self.unnest_search_keys:
90
120
  self.unnest_search_keys.remove(self.email_column)
91
121
 
92
- df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
93
-
94
- self.search_keys[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = SearchKey.EMAIL_ONE_DOMAIN
122
+ one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
123
+ df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
124
+ self.columns_renaming[one_domain_name] = original_email_column
125
+ self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
95
126
 
96
- df[self.DOMAIN_COLUMN_NAME] = df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME].str[1:]
97
- self.generated_features.append(self.DOMAIN_COLUMN_NAME)
127
+ if self.email_converted_to_hem:
128
+ df = df.drop(columns=self.email_column)
129
+ del self.columns_renaming[self.email_column]
98
130
 
99
131
  return df
upgini/utils/ip_utils.py CHANGED
@@ -1,15 +1,114 @@
1
1
  import logging
2
- from typing import Dict, List, Optional
2
+ from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
3
+ from typing import Dict, List, Optional, Union
3
4
 
4
5
  import pandas as pd
5
6
  from requests import get
6
7
 
8
+ from upgini.errors import ValidationError
7
9
  from upgini.metadata import SearchKey
10
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
8
11
 
9
12
  # from upgini.resource_bundle import bundle
10
13
  # from upgini.utils.track_info import get_track_metrics
11
14
 
12
15
 
16
+ class IpSearchKeyConverter:
17
+ def __init__(
18
+ self,
19
+ ip_column: str,
20
+ search_keys: Dict[str, SearchKey],
21
+ columns_renaming: Dict[str, str],
22
+ unnest_search_keys: Optional[List[str]] = None,
23
+ bundle: Optional[ResourceBundle] = None,
24
+ logger: Optional[logging.Logger] = None,
25
+ ):
26
+ self.ip_column = ip_column
27
+ self.search_keys = search_keys
28
+ self.columns_renaming = columns_renaming
29
+ self.unnest_search_keys = unnest_search_keys
30
+ self.bundle = bundle or get_custom_bundle()
31
+ if logger is not None:
32
+ self.logger = logger
33
+ else:
34
+ self.logger = logging.getLogger()
35
+ self.logger.setLevel("FATAL")
36
+
37
+ @staticmethod
38
+ def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
39
+ try:
40
+ if isinstance(ip, (IPv4Address, IPv6Address)):
41
+ return int(ip)
42
+ except Exception:
43
+ pass
44
+
45
+ @staticmethod
46
+ def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
47
+ try:
48
+ if isinstance(ip, (IPv4Address, IPv6Address)):
49
+ return str(int(ip))
50
+ except Exception:
51
+ pass
52
+
53
+ @staticmethod
54
+ def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
55
+ try:
56
+ return ip_address(ip)
57
+ except ValueError:
58
+ pass
59
+
60
+ @staticmethod
61
+ def _is_ipv4(ip: Optional[_BaseAddress]):
62
+ return ip is not None and (
63
+ isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
64
+ )
65
+
66
+ @staticmethod
67
+ def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
68
+ if isinstance(ip, IPv4Address):
69
+ return ip
70
+ return None
71
+
72
+ @staticmethod
73
+ def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
74
+ if isinstance(ip, IPv6Address):
75
+ return ip
76
+ if isinstance(ip, IPv4Address):
77
+ return IPv6Address("::ffff:" + str(ip))
78
+ return None
79
+
80
+ def convert(self, df: pd.DataFrame) -> pd.DataFrame:
81
+ """Convert ip address to int"""
82
+ self.logger.info("Convert ip address to int")
83
+ original_ip = self.columns_renaming[self.ip_column]
84
+
85
+ df[self.ip_column] = df[self.ip_column].apply(self._safe_ip_parse)
86
+ if df[self.ip_column].isnull().all():
87
+ raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
88
+
89
+ # legacy support
90
+ ipv4 = self.ip_column + "_v4"
91
+ df[ipv4] = df[self.ip_column].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
92
+ self.search_keys[ipv4] = SearchKey.IP
93
+ self.columns_renaming[ipv4] = original_ip
94
+
95
+ ipv6 = self.ip_column + "_v6"
96
+ df[ipv6] = (
97
+ df[self.ip_column]
98
+ .apply(self._to_ipv6)
99
+ .apply(self._ip_to_int_str)
100
+ .astype("string")
101
+ # .str.replace(".0", "", regex=False)
102
+ )
103
+ df = df.drop(columns=self.ip_column)
104
+ del self.search_keys[self.ip_column]
105
+ del self.columns_renaming[self.ip_column]
106
+ self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
107
+ self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
108
+
109
+ return df
110
+
111
+
13
112
  class IpToCountrySearchKeyConverter:
14
113
  url = "http://ip-api.com/json/{}"
15
114