upgini 1.2.49__py3-none-any.whl → 1.2.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +10 -3
- upgini/features_enricher.py +21 -4
- upgini/metadata.py +1 -0
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/ip_utils.py +35 -14
- {upgini-1.2.49.dist-info → upgini-1.2.51.dist-info}/METADATA +1 -1
- {upgini-1.2.49.dist-info → upgini-1.2.51.dist-info}/RECORD +10 -10
- {upgini-1.2.49.dist-info → upgini-1.2.51.dist-info}/WHEEL +0 -0
- {upgini-1.2.49.dist-info → upgini-1.2.51.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.49"
|
|
1
|
+
__version__ = "1.2.51"
|
upgini/dataset.py
CHANGED
|
@@ -37,12 +37,18 @@ from upgini.metadata import (
|
|
|
37
37
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
38
38
|
from upgini.search_task import SearchTask
|
|
39
39
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
40
|
-
from upgini.utils.target_utils import
|
|
40
|
+
from upgini.utils.target_utils import (
|
|
41
|
+
balance_undersample,
|
|
42
|
+
balance_undersample_forced,
|
|
43
|
+
balance_undersample_time_series,
|
|
44
|
+
)
|
|
41
45
|
|
|
42
46
|
try:
|
|
43
47
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
44
48
|
except Exception:
|
|
45
|
-
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
49
|
+
from upgini.utils.fallback_progress_bar import (
|
|
50
|
+
CustomFallbackProgressBar as ProgressBar,
|
|
51
|
+
)
|
|
46
52
|
|
|
47
53
|
|
|
48
54
|
class Dataset: # (pd.DataFrame):
|
|
@@ -347,7 +353,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
347
353
|
key
|
|
348
354
|
for search_group in self.search_keys_checked
|
|
349
355
|
for key in search_group
|
|
350
|
-
if
|
|
356
|
+
if key in self.columns_renaming
|
|
357
|
+
and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
351
358
|
}
|
|
352
359
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
353
360
|
if (
|
upgini/features_enricher.py
CHANGED
|
@@ -2270,6 +2270,7 @@ if response.status_code == 200:
|
|
|
2270
2270
|
df = converter.convert(df)
|
|
2271
2271
|
|
|
2272
2272
|
ip_column = self._get_ip_column(search_keys)
|
|
2273
|
+
ip_prefix_column = None
|
|
2273
2274
|
if ip_column:
|
|
2274
2275
|
converter = IpSearchKeyConverter(
|
|
2275
2276
|
ip_column,
|
|
@@ -2280,6 +2281,7 @@ if response.status_code == 200:
|
|
|
2280
2281
|
self.logger,
|
|
2281
2282
|
)
|
|
2282
2283
|
df = converter.convert(df)
|
|
2284
|
+
ip_prefix_column = converter.ip_prefix_column
|
|
2283
2285
|
|
|
2284
2286
|
phone_column = self._get_phone_column(search_keys)
|
|
2285
2287
|
country_column = self._get_country_column(search_keys)
|
|
@@ -2299,12 +2301,15 @@ if response.status_code == 200:
|
|
|
2299
2301
|
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2300
2302
|
|
|
2301
2303
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2304
|
+
if ip_prefix_column:
|
|
2305
|
+
meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
|
|
2302
2306
|
for col in features_for_transform:
|
|
2303
2307
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2304
2308
|
features_not_to_pass = [
|
|
2305
2309
|
c
|
|
2306
2310
|
for c in df.columns
|
|
2307
2311
|
if c not in search_keys.keys()
|
|
2312
|
+
and c != ip_prefix_column
|
|
2308
2313
|
and c not in features_for_transform
|
|
2309
2314
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2310
2315
|
]
|
|
@@ -2619,6 +2624,11 @@ if response.status_code == 200:
|
|
|
2619
2624
|
self.generate_features = checked_generate_features
|
|
2620
2625
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2621
2626
|
|
|
2627
|
+
if self.id_columns is not None:
|
|
2628
|
+
for id_column in self.id_columns:
|
|
2629
|
+
if id_column not in validated_X.columns:
|
|
2630
|
+
raise ValidationError(self.bundle.get("missing_id_column").format(id_column))
|
|
2631
|
+
|
|
2622
2632
|
validate_scoring_argument(scoring)
|
|
2623
2633
|
|
|
2624
2634
|
self.__log_debug_information(
|
|
@@ -2761,6 +2771,7 @@ if response.status_code == 200:
|
|
|
2761
2771
|
df = converter.convert(df)
|
|
2762
2772
|
|
|
2763
2773
|
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2774
|
+
ip_prefix_column = None
|
|
2764
2775
|
if ip_column:
|
|
2765
2776
|
converter = IpSearchKeyConverter(
|
|
2766
2777
|
ip_column,
|
|
@@ -2771,7 +2782,7 @@ if response.status_code == 200:
|
|
|
2771
2782
|
self.logger,
|
|
2772
2783
|
)
|
|
2773
2784
|
df = converter.convert(df)
|
|
2774
|
-
|
|
2785
|
+
ip_prefix_column = converter.ip_prefix_column
|
|
2775
2786
|
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2776
2787
|
country_column = self._get_country_column(self.fit_search_keys)
|
|
2777
2788
|
if phone_column:
|
|
@@ -2787,9 +2798,13 @@ if response.status_code == 200:
|
|
|
2787
2798
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2788
2799
|
df = converter.convert(df)
|
|
2789
2800
|
|
|
2790
|
-
non_feature_columns = [
|
|
2791
|
-
self.
|
|
2792
|
-
|
|
2801
|
+
non_feature_columns = [
|
|
2802
|
+
self.TARGET_NAME,
|
|
2803
|
+
EVAL_SET_INDEX,
|
|
2804
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
2805
|
+
SEARCH_KEY_UNNEST,
|
|
2806
|
+
ip_prefix_column,
|
|
2807
|
+
] + list(self.fit_search_keys.keys())
|
|
2793
2808
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2794
2809
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2795
2810
|
|
|
@@ -2810,6 +2825,8 @@ if response.status_code == 200:
|
|
|
2810
2825
|
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2811
2826
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2812
2827
|
}
|
|
2828
|
+
if ip_prefix_column:
|
|
2829
|
+
meaning_types[ip_prefix_column] = FileColumnMeaningType.IP_PREFIX
|
|
2813
2830
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2814
2831
|
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2815
2832
|
if SEARCH_KEY_UNNEST in df.columns:
|
upgini/metadata.py
CHANGED
|
@@ -44,6 +44,7 @@ class FileColumnMeaningType(Enum):
|
|
|
44
44
|
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
45
|
UNNEST_KEY = "UNNEST_KEY"
|
|
46
46
|
IP_BINARY = "IP_BINARY"
|
|
47
|
+
IP_PREFIX = "IP_PREFIX"
|
|
47
48
|
IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
|
|
48
49
|
IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
|
|
49
50
|
|
|
@@ -134,6 +134,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
|
134
134
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
135
135
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
136
136
|
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
|
137
|
+
missing_id_column=Id column {} not found in X
|
|
137
138
|
# target validation
|
|
138
139
|
empty_target=Target is empty in all rows
|
|
139
140
|
# non_numeric_target=Binary target should be numerical type
|
upgini/utils/ip_utils.py
CHANGED
|
@@ -33,9 +33,12 @@ class IpSearchKeyConverter:
|
|
|
33
33
|
else:
|
|
34
34
|
self.logger = logging.getLogger()
|
|
35
35
|
self.logger.setLevel("FATAL")
|
|
36
|
+
self.ip_prefix_column = None
|
|
36
37
|
|
|
37
38
|
@staticmethod
|
|
38
39
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
40
|
+
if ip is None:
|
|
41
|
+
return None
|
|
39
42
|
try:
|
|
40
43
|
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
41
44
|
return int(ip)
|
|
@@ -44,6 +47,8 @@ class IpSearchKeyConverter:
|
|
|
44
47
|
|
|
45
48
|
@staticmethod
|
|
46
49
|
def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
|
|
50
|
+
if ip is None:
|
|
51
|
+
return None
|
|
47
52
|
try:
|
|
48
53
|
if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
|
|
49
54
|
return ip.ipv4_mapped.packed
|
|
@@ -52,6 +57,20 @@ class IpSearchKeyConverter:
|
|
|
52
57
|
except Exception:
|
|
53
58
|
pass
|
|
54
59
|
|
|
60
|
+
@staticmethod
|
|
61
|
+
def _ip_to_prefix(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
62
|
+
if ip is None:
|
|
63
|
+
return None
|
|
64
|
+
try:
|
|
65
|
+
if isinstance(ip, IPv6Address):
|
|
66
|
+
if ip.ipv4_mapped is not None:
|
|
67
|
+
return ".".join(ip.ipv4_mapped.exploded.split(".")[:2])
|
|
68
|
+
return ":".join(ip.exploded.split(":")[:2]) # TODO use 3 in future
|
|
69
|
+
else:
|
|
70
|
+
return ".".join(ip.exploded.split(".")[:2])
|
|
71
|
+
except Exception:
|
|
72
|
+
pass
|
|
73
|
+
|
|
55
74
|
@staticmethod
|
|
56
75
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
57
76
|
try:
|
|
@@ -102,24 +121,26 @@ class IpSearchKeyConverter:
|
|
|
102
121
|
# self.search_keys[ipv4] = SearchKey.IP
|
|
103
122
|
# self.columns_renaming[ipv4] = original_ip
|
|
104
123
|
|
|
105
|
-
ipv6 = self.ip_column + "_v6"
|
|
106
|
-
df[ipv6] = (
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
|
|
124
|
+
# ipv6 = self.ip_column + "_v6"
|
|
125
|
+
# df[ipv6] = (
|
|
126
|
+
# df[self.ip_column]
|
|
127
|
+
# .apply(self._to_ipv6)
|
|
128
|
+
# .apply(self._ip_to_int_str)
|
|
129
|
+
# .astype("string")
|
|
130
|
+
# # .str.replace(".0", "", regex=False)
|
|
131
|
+
# )
|
|
132
|
+
ip_binary = self.ip_column + "_binary"
|
|
133
|
+
df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
|
|
134
|
+
self.ip_prefix_column = self.ip_column + "_prefix"
|
|
135
|
+
df[self.ip_prefix_column] = df[self.ip_column].apply(self._ip_to_prefix)
|
|
115
136
|
|
|
116
137
|
df = df.drop(columns=self.ip_column)
|
|
117
138
|
del self.search_keys[self.ip_column]
|
|
118
139
|
del self.columns_renaming[self.ip_column]
|
|
119
|
-
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
120
|
-
|
|
121
|
-
self.columns_renaming[ipv6] = original_ip
|
|
122
|
-
|
|
140
|
+
# self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
141
|
+
self.search_keys[ip_binary] = SearchKey.IP_BINARY
|
|
142
|
+
# self.columns_renaming[ipv6] = original_ip
|
|
143
|
+
self.columns_renaming[ip_binary] = original_ip
|
|
123
144
|
|
|
124
145
|
return df
|
|
125
146
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=kgsz9u_lLDc3N0akch6v9PpMXz_PW7_aEHXRb1pWgHg,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=80h-1a-UxhuknvuEk1tQk5q5dckqlD_DHzNfufNuaPI,201110
|
|
7
7
|
upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256
|
|
9
|
+
upgini/metadata.py,sha256=zuLdt5XyO_ZH4VsUNshzRHgv6VfYiXy0M8jeohloFBw,12082
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -51,7 +51,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
51
51
|
upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
|
-
upgini/utils/ip_utils.py,sha256=
|
|
54
|
+
upgini/utils/ip_utils.py,sha256=GZqBaV-nky-_Yb9KclmTrYovCG4kawYcbdjEpw1e5Mo,6500
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.51.dist-info/METADATA,sha256=WXti81Fx4H5NawX2D7XvQ5cPEUVi4mlMkykbn94gXKI,49055
|
|
63
|
+
upgini-1.2.51.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.51.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.51.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|