upgini 1.2.96a2__py3-none-any.whl → 1.2.96a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +25 -49
- upgini/normalizer/normalize_utils.py +25 -0
- upgini/utils/ip_utils.py +2 -2
- {upgini-1.2.96a2.dist-info → upgini-1.2.96a3.dist-info}/METADATA +1 -1
- {upgini-1.2.96a2.dist-info → upgini-1.2.96a3.dist-info}/RECORD +8 -8
- {upgini-1.2.96a2.dist-info → upgini-1.2.96a3.dist-info}/WHEEL +0 -0
- {upgini-1.2.96a2.dist-info → upgini-1.2.96a3.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.96a2"
|
1
|
+
__version__ = "1.2.96a3"
|
upgini/features_enricher.py
CHANGED
@@ -71,10 +71,7 @@ from upgini.search_task import SearchTask
|
|
71
71
|
from upgini.spinner import Spinner
|
72
72
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
73
73
|
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
74
|
-
from upgini.utils.country_utils import (
|
75
|
-
CountrySearchKeyConverter,
|
76
|
-
CountrySearchKeyDetector,
|
77
|
-
)
|
74
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
78
75
|
from upgini.utils.custom_loss_utils import (
|
79
76
|
get_additional_params_custom_loss,
|
80
77
|
get_runtime_params_custom_loss,
|
@@ -105,11 +102,8 @@ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
105
102
|
from upgini.utils.features_validator import FeaturesValidator
|
106
103
|
from upgini.utils.format import Format
|
107
104
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
108
|
-
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
109
|
-
from upgini.utils.postal_code_utils import (
|
110
|
-
PostalCodeSearchKeyConverter,
|
111
|
-
PostalCodeSearchKeyDetector,
|
112
|
-
)
|
105
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
106
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
113
107
|
|
114
108
|
try:
|
115
109
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -2505,21 +2499,6 @@ if response.status_code == 200:
|
|
2505
2499
|
)
|
2506
2500
|
df = converter.convert(df)
|
2507
2501
|
|
2508
|
-
phone_column = self._get_phone_column(search_keys)
|
2509
|
-
country_column = self._get_country_column(search_keys)
|
2510
|
-
if phone_column:
|
2511
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
2512
|
-
df = converter.convert(df)
|
2513
|
-
|
2514
|
-
if country_column:
|
2515
|
-
converter = CountrySearchKeyConverter(country_column)
|
2516
|
-
df = converter.convert(df)
|
2517
|
-
|
2518
|
-
postal_code = self._get_postal_column(search_keys)
|
2519
|
-
if postal_code:
|
2520
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
2521
|
-
df = converter.convert(df)
|
2522
|
-
|
2523
2502
|
meaning_types = {}
|
2524
2503
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2525
2504
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
@@ -2914,6 +2893,7 @@ if response.status_code == 200:
|
|
2914
2893
|
self.fit_generated_features.extend(converter.generated_features)
|
2915
2894
|
else:
|
2916
2895
|
self.logger.info("Input dataset hasn't date column")
|
2896
|
+
# TODO remove when this logic will be implemented on the back
|
2917
2897
|
if self.__should_add_date_column():
|
2918
2898
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
2919
2899
|
|
@@ -2945,6 +2925,26 @@ if response.status_code == 200:
|
|
2945
2925
|
if normalizer.removed_features:
|
2946
2926
|
self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
|
2947
2927
|
|
2928
|
+
non_feature_columns = [
|
2929
|
+
self.TARGET_NAME,
|
2930
|
+
EVAL_SET_INDEX,
|
2931
|
+
] + list(self.fit_search_keys.keys())
|
2932
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2933
|
+
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
2934
|
+
|
2935
|
+
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2936
|
+
|
2937
|
+
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2938
|
+
df, features_columns, self.generate_features, self.fit_columns_renaming
|
2939
|
+
)
|
2940
|
+
if feature_validator_warnings:
|
2941
|
+
for warning in feature_validator_warnings:
|
2942
|
+
self.__log_warning(warning)
|
2943
|
+
self.fit_dropped_features.update(features_to_drop)
|
2944
|
+
df = df.drop(columns=features_to_drop)
|
2945
|
+
|
2946
|
+
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
2947
|
+
|
2948
2948
|
self.__adjust_cv(df)
|
2949
2949
|
|
2950
2950
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
@@ -2984,6 +2984,7 @@ if response.status_code == 200:
|
|
2984
2984
|
# Convert EMAIL to HEM etc after unnesting to do it only with one column
|
2985
2985
|
df = self.__convert_unnestable_keys(df, unnest_search_keys)
|
2986
2986
|
|
2987
|
+
# refresh features columns
|
2987
2988
|
non_feature_columns = [
|
2988
2989
|
self.TARGET_NAME,
|
2989
2990
|
EVAL_SET_INDEX,
|
@@ -2995,17 +2996,6 @@ if response.status_code == 200:
|
|
2995
2996
|
|
2996
2997
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
2997
2998
|
|
2998
|
-
features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
|
2999
|
-
df, features_columns, self.generate_features, self.fit_columns_renaming
|
3000
|
-
)
|
3001
|
-
if feature_validator_warnings:
|
3002
|
-
for warning in feature_validator_warnings:
|
3003
|
-
self.__log_warning(warning)
|
3004
|
-
self.fit_dropped_features.update(features_to_drop)
|
3005
|
-
df = df.drop(columns=features_to_drop)
|
3006
|
-
|
3007
|
-
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
3008
|
-
|
3009
2999
|
meaning_types = {
|
3010
3000
|
**{col: key.value for col, key in self.fit_search_keys.items()},
|
3011
3001
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
@@ -3235,20 +3225,6 @@ if response.status_code == 200:
|
|
3235
3225
|
self.logger,
|
3236
3226
|
)
|
3237
3227
|
df = converter.convert(df)
|
3238
|
-
phone_column = self._get_phone_column(self.fit_search_keys)
|
3239
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
3240
|
-
if phone_column:
|
3241
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
3242
|
-
df = converter.convert(df)
|
3243
|
-
|
3244
|
-
if country_column:
|
3245
|
-
converter = CountrySearchKeyConverter(country_column)
|
3246
|
-
df = converter.convert(df)
|
3247
|
-
|
3248
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
3249
|
-
if postal_code:
|
3250
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
3251
|
-
df = converter.convert(df)
|
3252
3228
|
|
3253
3229
|
return df
|
3254
3230
|
|
upgini/normalizer/normalize_utils.py
CHANGED
@@ -24,8 +24,11 @@ from upgini.metadata import (
|
|
24
24
|
)
|
25
25
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
26
26
|
from upgini.utils import find_numbers_with_decimal_comma
|
27
|
+
from upgini.utils.country_utils import CountrySearchKeyConverter
|
27
28
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
29
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
28
30
|
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
31
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
|
29
32
|
|
30
33
|
|
31
34
|
class Normalizer:
|
@@ -65,6 +68,12 @@ class Normalizer:
|
|
65
68
|
|
66
69
|
df = self._convert_phone_numbers(df)
|
67
70
|
|
71
|
+
df = self._convert_ip_addresses(df)
|
72
|
+
|
73
|
+
df = self._convert_postal_codes(df)
|
74
|
+
|
75
|
+
df = self._convert_countries(df)
|
76
|
+
|
68
77
|
df = self.__convert_features_types(df)
|
69
78
|
|
70
79
|
return df, self.search_keys, self.generated_features
|
@@ -191,6 +200,22 @@ class Normalizer:
|
|
191
200
|
df = converter.convert(df)
|
192
201
|
return df
|
193
202
|
|
203
|
+
def _convert_ip_addresses(self, df: pd.DataFrame) -> pd.DataFrame:
|
204
|
+
for ip_col in SearchKey.find_all_keys(self.search_keys, SearchKey.IP):
|
205
|
+
df[ip_col] = df[ip_col].apply(IpSearchKeyConverter.safe_ip_parse)
|
206
|
+
return df
|
207
|
+
|
208
|
+
def _convert_postal_codes(self, df: pd.DataFrame) -> pd.DataFrame:
|
209
|
+
for postal_code_col in SearchKey.find_all_keys(self.search_keys, SearchKey.POSTAL_CODE):
|
210
|
+
df = PostalCodeSearchKeyConverter(postal_code_col).convert(df)
|
211
|
+
return df
|
212
|
+
|
213
|
+
def _convert_countries(self, df: pd.DataFrame) -> pd.DataFrame:
|
214
|
+
maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
|
215
|
+
if maybe_country_col:
|
216
|
+
df = CountrySearchKeyConverter(maybe_country_col).convert(df)
|
217
|
+
return df
|
218
|
+
|
194
219
|
def __convert_features_types(self, df: pd.DataFrame):
|
195
220
|
# self.logger.info("Convert features to supported data types")
|
196
221
|
|
upgini/utils/ip_utils.py
CHANGED
@@ -79,7 +79,7 @@ class IpSearchKeyConverter:
|
|
79
79
|
pass
|
80
80
|
|
81
81
|
@staticmethod
|
82
|
-
def
|
82
|
+
def safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address, bytes]) -> Optional[_BaseAddress]:
|
83
83
|
try:
|
84
84
|
return ip_address(ip)
|
85
85
|
except ValueError:
|
@@ -110,7 +110,7 @@ class IpSearchKeyConverter:
|
|
110
110
|
self.logger.info("Convert ip address to int")
|
111
111
|
original_ip = self.columns_renaming[self.ip_column]
|
112
112
|
|
113
|
-
df[self.ip_column] = df[self.ip_column].apply(self.
|
113
|
+
df[self.ip_column] = df[self.ip_column].apply(self.safe_ip_parse)
|
114
114
|
if df[self.ip_column].isnull().all():
|
115
115
|
raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
|
116
116
|
|
{upgini-1.2.96a2.dist-info → upgini-1.2.96a3.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=euNfF4usca0xoZtvXArti-DB6GH8wgwXSYVRRbwzMkE,25
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=dGB4utdlCtgILwX1Hnchg066fwwoyItPBSyzoPe64Z8,218244
|
7
7
|
upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
9
|
upgini/metrics.py,sha256=UbKEsHB7XDzoyGNqDx846zbh1t65GpqdnnhViccdoKU,45615
|
@@ -35,7 +35,7 @@ upgini/data_source/data_source_publisher.py,sha256=ufL8qK1vg8iUKd5bLWz6hEMGiC3Je
|
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
38
|
+
upgini/normalizer/normalize_utils.py,sha256=BSP0vIjRPrupL1sziAudPXJ-qsO4UE9Pyhwiqa1MZV8,8484
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
41
|
upgini/resource_bundle/strings.properties,sha256=UO6K0wwvutyOyClOnJYlFYAETzMSen6hHnj3--5AIAs,28497
|
@@ -59,7 +59,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
59
59
|
upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
62
|
-
upgini/utils/ip_utils.py,sha256=
|
62
|
+
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
63
63
|
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.96a3.dist-info/METADATA,sha256=HdJ2Ptri3J0BuSdc6tXhBZ79vpGvbKIuPx5PVLrtTeY,49530
|
75
|
+
upgini-1.2.96a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
+
upgini-1.2.96a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.96a3.dist-info/RECORD,,
|
File without changes
|
File without changes
|