upgini 1.2.49__py3-none-any.whl → 1.2.51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.49"
1
+ __version__ = "1.2.51"
upgini/dataset.py CHANGED
@@ -37,12 +37,18 @@ from upgini.metadata import (
37
37
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
38
38
  from upgini.search_task import SearchTask
39
39
  from upgini.utils.email_utils import EmailSearchKeyConverter
40
- from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
40
+ from upgini.utils.target_utils import (
41
+ balance_undersample,
42
+ balance_undersample_forced,
43
+ balance_undersample_time_series,
44
+ )
41
45
 
42
46
  try:
43
47
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
44
48
  except Exception:
45
- from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
49
+ from upgini.utils.fallback_progress_bar import (
50
+ CustomFallbackProgressBar as ProgressBar,
51
+ )
46
52
 
47
53
 
48
54
  class Dataset: # (pd.DataFrame):
@@ -347,7 +353,8 @@ class Dataset: # (pd.DataFrame):
347
353
  key
348
354
  for search_group in self.search_keys_checked
349
355
  for key in search_group
350
- if not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
356
+ if key in self.columns_renaming
357
+ and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
351
358
  }
352
359
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
353
360
  if (
upgini/features_enricher.py CHANGED
@@ -2270,6 +2270,7 @@ if response.status_code == 200:
2270
2270
  df = converter.convert(df)
2271
2271
 
2272
2272
  ip_column = self._get_ip_column(search_keys)
2273
+ ip_prefix_column = None
2273
2274
  if ip_column:
2274
2275
  converter = IpSearchKeyConverter(
2275
2276
  ip_column,
@@ -2280,6 +2281,7 @@ if response.status_code == 200:
2280
2281
  self.logger,
2281
2282
  )
2282
2283
  df = converter.convert(df)
2284
+ ip_prefix_column = converter.ip_prefix_column
2283
2285
 
2284
2286
  phone_column = self._get_phone_column(search_keys)
2285
2287
  country_column = self._get_country_column(search_keys)
@@ -2299,12 +2301,15 @@ if response.status_code == 200:
2299
2301
  # generated_features = [f for f in generated_features if f in self.fit_generated_features]
2300
2302
 
2301
2303
  meaning_types = {col: key.value for col, key in search_keys.items()}
2304
+ if ip_prefix_column:
2305
+ meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
2302
2306
  for col in features_for_transform:
2303
2307
  meaning_types[col] = FileColumnMeaningType.FEATURE
2304
2308
  features_not_to_pass = [
2305
2309
  c
2306
2310
  for c in df.columns
2307
2311
  if c not in search_keys.keys()
2312
+ and c != ip_prefix_column
2308
2313
  and c not in features_for_transform
2309
2314
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2310
2315
  ]
@@ -2619,6 +2624,11 @@ if response.status_code == 200:
2619
2624
  self.generate_features = checked_generate_features
2620
2625
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2621
2626
 
2627
+ if self.id_columns is not None:
2628
+ for id_column in self.id_columns:
2629
+ if id_column not in validated_X.columns:
2630
+ raise ValidationError(self.bundle.get("missing_id_column").format(id_column))
2631
+
2622
2632
  validate_scoring_argument(scoring)
2623
2633
 
2624
2634
  self.__log_debug_information(
@@ -2761,6 +2771,7 @@ if response.status_code == 200:
2761
2771
  df = converter.convert(df)
2762
2772
 
2763
2773
  ip_column = self._get_ip_column(self.fit_search_keys)
2774
+ ip_prefix_column = None
2764
2775
  if ip_column:
2765
2776
  converter = IpSearchKeyConverter(
2766
2777
  ip_column,
@@ -2771,7 +2782,7 @@ if response.status_code == 200:
2771
2782
  self.logger,
2772
2783
  )
2773
2784
  df = converter.convert(df)
2774
-
2785
+ ip_prefix_column = converter.ip_prefix_column
2775
2786
  phone_column = self._get_phone_column(self.fit_search_keys)
2776
2787
  country_column = self._get_country_column(self.fit_search_keys)
2777
2788
  if phone_column:
@@ -2787,9 +2798,13 @@ if response.status_code == 200:
2787
2798
  converter = PostalCodeSearchKeyConverter(postal_code)
2788
2799
  df = converter.convert(df)
2789
2800
 
2790
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2791
- self.fit_search_keys.keys()
2792
- )
2801
+ non_feature_columns = [
2802
+ self.TARGET_NAME,
2803
+ EVAL_SET_INDEX,
2804
+ ENTITY_SYSTEM_RECORD_ID,
2805
+ SEARCH_KEY_UNNEST,
2806
+ ip_prefix_column,
2807
+ ] + list(self.fit_search_keys.keys())
2793
2808
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2794
2809
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2795
2810
 
@@ -2810,6 +2825,8 @@ if response.status_code == 200:
2810
2825
  **{col: key.value for col, key in self.fit_search_keys.items()},
2811
2826
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2812
2827
  }
2828
+ if ip_prefix_column:
2829
+ meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
2813
2830
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2814
2831
  meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2815
2832
  if SEARCH_KEY_UNNEST in df.columns:
upgini/metadata.py CHANGED
@@ -44,6 +44,7 @@ class FileColumnMeaningType(Enum):
44
44
  ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
45
  UNNEST_KEY = "UNNEST_KEY"
46
46
  IP_BINARY = "IP_BINARY"
47
+ IP_PREFIX = "IP_PREFIX"
47
48
  IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
48
49
  IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
49
50
 
upgini/resource_bundle/strings.properties CHANGED
@@ -134,6 +134,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
134
134
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
135
135
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
136
136
  missing_features_for_transform=Missing some features for transform that were presented on fit: {}
137
+ missing_id_column=Id column {} not found in X
137
138
  # target validation
138
139
  empty_target=Target is empty in all rows
139
140
  # non_numeric_target=Binary target should be numerical type
upgini/utils/ip_utils.py CHANGED
@@ -33,9 +33,12 @@ class IpSearchKeyConverter:
33
33
  else:
34
34
  self.logger = logging.getLogger()
35
35
  self.logger.setLevel("FATAL")
36
+ self.ip_prefix_column = None
36
37
 
37
38
  @staticmethod
38
39
  def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
40
+ if ip is None:
41
+ return None
39
42
  try:
40
43
  if isinstance(ip, (IPv4Address, IPv6Address)):
41
44
  return int(ip)
@@ -44,6 +47,8 @@ class IpSearchKeyConverter:
44
47
 
45
48
  @staticmethod
46
49
  def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
50
+ if ip is None:
51
+ return None
47
52
  try:
48
53
  if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
49
54
  return ip.ipv4_mapped.packed
@@ -52,6 +57,20 @@ class IpSearchKeyConverter:
52
57
  except Exception:
53
58
  pass
54
59
 
60
+ @staticmethod
61
+ def _ip_to_prefix(ip: Optional[_BaseAddress]) -> Optional[str]:
62
+ if ip is None:
63
+ return None
64
+ try:
65
+ if isinstance(ip, IPv6Address):
66
+ if ip.ipv4_mapped is not None:
67
+ return ".".join(ip.ipv4_mapped.exploded.split(".")[:2])
68
+ return ":".join(ip.exploded.split(":")[:2]) # TODO use 3 in future
69
+ else:
70
+ return ".".join(ip.exploded.split(".")[:2])
71
+ except Exception:
72
+ pass
73
+
55
74
  @staticmethod
56
75
  def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
57
76
  try:
@@ -102,24 +121,26 @@ class IpSearchKeyConverter:
102
121
  # self.search_keys[ipv4] = SearchKey.IP
103
122
  # self.columns_renaming[ipv4] = original_ip
104
123
 
105
- ipv6 = self.ip_column + "_v6"
106
- df[ipv6] = (
107
- df[self.ip_column]
108
- .apply(self._to_ipv6)
109
- .apply(self._ip_to_int_str)
110
- .astype("string")
111
- # .str.replace(".0", "", regex=False)
112
- )
113
- # ip_binary = self.ip_column + "_binary"
114
- # df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
124
+ # ipv6 = self.ip_column + "_v6"
125
+ # df[ipv6] = (
126
+ # df[self.ip_column]
127
+ # .apply(self._to_ipv6)
128
+ # .apply(self._ip_to_int_str)
129
+ # .astype("string")
130
+ # # .str.replace(".0", "", regex=False)
131
+ # )
132
+ ip_binary = self.ip_column + "_binary"
133
+ df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
134
+ self.ip_prefix_column = self.ip_column + "_prefix"
135
+ df[self.ip_prefix_column] = df[self.ip_column].apply(self._ip_to_prefix)
115
136
 
116
137
  df = df.drop(columns=self.ip_column)
117
138
  del self.search_keys[self.ip_column]
118
139
  del self.columns_renaming[self.ip_column]
119
- self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
120
- # self.search_keys[ip_binary] = SearchKey.IP_BINARY
121
- self.columns_renaming[ipv6] = original_ip
122
- # self.columns_renaming[ip_binary] = original_ip
140
+ # self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
141
+ self.search_keys[ip_binary] = SearchKey.IP_BINARY
142
+ # self.columns_renaming[ipv6] = original_ip
143
+ self.columns_renaming[ip_binary] = original_ip
123
144
 
124
145
  return df
125
146
 
upgini-1.2.51.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.49
3
+ Version: 1.2.51
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.51.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=qkDVgmmc_W7WSA8UFQzKzULlTX27bnHrBTAlSeTYqYs,23
1
+ upgini/__about__.py,sha256=kgsz9u_lLDc3N0akch6v9PpMXz_PW7_aEHXRb1pWgHg,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=QC3jncWS3wHe4CY7pWWDMO_3HKxGbi0EyPHXMdBtoQM,33456
4
+ upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=8bhABqZ3aXIc_5WBVNqFnRNT_0fCNbLyPwXv7VXdygs,200350
6
+ upgini/features_enricher.py,sha256=80h-1a-UxhuknvuEk1tQk5q5dckqlD_DHzNfufNuaPI,201110
7
7
  upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
9
+ upgini/metadata.py,sha256=zuLdt5XyO_ZH4VsUNshzRHgv6VfYiXy0M8jeohloFBw,12082
10
10
  upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=0jZC0HjyQHeqFCHt6nn1kz7vV0oq92AYQJvy-soAwe4,27304
33
+ upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -51,7 +51,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
51
51
  upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
52
52
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
53
53
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
54
- upgini/utils/ip_utils.py,sha256=VORRmtKlItcbBVVK5SiwXD7J-6Y5rn7UQ5m6WcBXt7E,5698
54
+ upgini/utils/ip_utils.py,sha256=GZqBaV-nky-_Yb9KclmTrYovCG4kawYcbdjEpw1e5Mo,6500
55
55
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
56
56
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
57
57
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.49.dist-info/METADATA,sha256=XZ11OqCR6UqQrqo2RsKAIZYdPIVmIcn61GxRJn6f9Ys,49055
63
- upgini-1.2.49.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.49.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.49.dist-info/RECORD,,
62
+ upgini-1.2.51.dist-info/METADATA,sha256=WXti81Fx4H5NawX2D7XvQ5cPEUVi4mlMkykbn94gXKI,49055
63
+ upgini-1.2.51.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.51.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.51.dist-info/RECORD,,