upgini 1.2.96a1__py3-none-any.whl → 1.2.96a3__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.96a1"
1
+ __version__ = "1.2.96a3"
upgini/features_enricher.py CHANGED
@@ -71,10 +71,7 @@ from upgini.search_task import SearchTask
71
71
  from upgini.spinner import Spinner
72
72
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
73
73
  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
74
- from upgini.utils.country_utils import (
75
- CountrySearchKeyConverter,
76
- CountrySearchKeyDetector,
77
- )
74
+ from upgini.utils.country_utils import CountrySearchKeyDetector
78
75
  from upgini.utils.custom_loss_utils import (
79
76
  get_additional_params_custom_loss,
80
77
  get_runtime_params_custom_loss,
@@ -105,11 +102,8 @@ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
105
102
  from upgini.utils.features_validator import FeaturesValidator
106
103
  from upgini.utils.format import Format
107
104
  from upgini.utils.ip_utils import IpSearchKeyConverter
108
- from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
109
- from upgini.utils.postal_code_utils import (
110
- PostalCodeSearchKeyConverter,
111
- PostalCodeSearchKeyDetector,
112
- )
105
+ from upgini.utils.phone_utils import PhoneSearchKeyDetector
106
+ from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
113
107
 
114
108
  try:
115
109
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -1154,7 +1148,7 @@ class FeaturesEnricher(TransformerMixin):
1154
1148
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1155
1149
  if baseline_metric is not None and enriched_metric is not None:
1156
1150
  uplift = (enriched_cv_result.metric - baseline_cv_result.metric) * multiplier
1157
- uplift_perc = uplift / baseline_cv_result.metric * 100 * multiplier
1151
+ uplift_perc = uplift / abs(baseline_cv_result.metric) * 100
1158
1152
 
1159
1153
  train_metrics = {
1160
1154
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1233,7 +1227,7 @@ class FeaturesEnricher(TransformerMixin):
1233
1227
 
1234
1228
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1235
1229
  eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1236
- eval_uplift_perc = eval_uplift / etalon_eval_results.metric * 100 * multiplier
1230
+ eval_uplift_perc = eval_uplift / abs(etalon_eval_results.metric) * 100
1237
1231
  else:
1238
1232
  eval_uplift = None
1239
1233
  eval_uplift_perc = None
@@ -2505,21 +2499,6 @@ if response.status_code == 200:
2505
2499
  )
2506
2500
  df = converter.convert(df)
2507
2501
 
2508
- phone_column = self._get_phone_column(search_keys)
2509
- country_column = self._get_country_column(search_keys)
2510
- if phone_column:
2511
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2512
- df = converter.convert(df)
2513
-
2514
- if country_column:
2515
- converter = CountrySearchKeyConverter(country_column)
2516
- df = converter.convert(df)
2517
-
2518
- postal_code = self._get_postal_column(search_keys)
2519
- if postal_code:
2520
- converter = PostalCodeSearchKeyConverter(postal_code)
2521
- df = converter.convert(df)
2522
-
2523
2502
  meaning_types = {}
2524
2503
  meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
2525
2504
  meaning_types.update({col: key.value for col, key in search_keys.items()})
@@ -2914,6 +2893,7 @@ if response.status_code == 200:
2914
2893
  self.fit_generated_features.extend(converter.generated_features)
2915
2894
  else:
2916
2895
  self.logger.info("Input dataset hasn't date column")
2896
+ # TODO remove when this logic will be implemented on the back
2917
2897
  if self.__should_add_date_column():
2918
2898
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2919
2899
 
@@ -2945,6 +2925,26 @@ if response.status_code == 200:
2945
2925
  if normalizer.removed_features:
2946
2926
  self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
2947
2927
 
2928
+ non_feature_columns = [
2929
+ self.TARGET_NAME,
2930
+ EVAL_SET_INDEX,
2931
+ ] + list(self.fit_search_keys.keys())
2932
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2933
+ non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2934
+
2935
+ features_columns = [c for c in df.columns if c not in non_feature_columns]
2936
+
2937
+ features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
2938
+ df, features_columns, self.generate_features, self.fit_columns_renaming
2939
+ )
2940
+ if feature_validator_warnings:
2941
+ for warning in feature_validator_warnings:
2942
+ self.__log_warning(warning)
2943
+ self.fit_dropped_features.update(features_to_drop)
2944
+ df = df.drop(columns=features_to_drop)
2945
+
2946
+ self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2947
+
2948
2948
  self.__adjust_cv(df)
2949
2949
 
2950
2950
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
@@ -2984,6 +2984,7 @@ if response.status_code == 200:
2984
2984
  # Convert EMAIL to HEM etc after unnesting to do it only with one column
2985
2985
  df = self.__convert_unnestable_keys(df, unnest_search_keys)
2986
2986
 
2987
+ # refresh features columns
2987
2988
  non_feature_columns = [
2988
2989
  self.TARGET_NAME,
2989
2990
  EVAL_SET_INDEX,
@@ -2995,17 +2996,6 @@ if response.status_code == 200:
2995
2996
 
2996
2997
  features_columns = [c for c in df.columns if c not in non_feature_columns]
2997
2998
 
2998
- features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
2999
- df, features_columns, self.generate_features, self.fit_columns_renaming
3000
- )
3001
- if feature_validator_warnings:
3002
- for warning in feature_validator_warnings:
3003
- self.__log_warning(warning)
3004
- self.fit_dropped_features.update(features_to_drop)
3005
- df = df.drop(columns=features_to_drop)
3006
-
3007
- self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
3008
-
3009
2999
  meaning_types = {
3010
3000
  **{col: key.value for col, key in self.fit_search_keys.items()},
3011
3001
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
@@ -3235,20 +3225,6 @@ if response.status_code == 200:
3235
3225
  self.logger,
3236
3226
  )
3237
3227
  df = converter.convert(df)
3238
- phone_column = self._get_phone_column(self.fit_search_keys)
3239
- country_column = self._get_country_column(self.fit_search_keys)
3240
- if phone_column:
3241
- converter = PhoneSearchKeyConverter(phone_column, country_column)
3242
- df = converter.convert(df)
3243
-
3244
- if country_column:
3245
- converter = CountrySearchKeyConverter(country_column)
3246
- df = converter.convert(df)
3247
-
3248
- postal_code = self._get_postal_column(self.fit_search_keys)
3249
- if postal_code:
3250
- converter = PostalCodeSearchKeyConverter(postal_code)
3251
- df = converter.convert(df)
3252
3228
 
3253
3229
  return df
3254
3230
 
upgini/normalizer/normalize_utils.py CHANGED
@@ -24,8 +24,11 @@ from upgini.metadata import (
24
24
  )
25
25
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
26
26
  from upgini.utils import find_numbers_with_decimal_comma
27
+ from upgini.utils.country_utils import CountrySearchKeyConverter
27
28
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
29
+ from upgini.utils.ip_utils import IpSearchKeyConverter
28
30
  from upgini.utils.phone_utils import PhoneSearchKeyConverter
31
+ from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
29
32
 
30
33
 
31
34
  class Normalizer:
@@ -65,6 +68,12 @@ class Normalizer:
65
68
 
66
69
  df = self._convert_phone_numbers(df)
67
70
 
71
+ df = self._convert_ip_addresses(df)
72
+
73
+ df = self._convert_postal_codes(df)
74
+
75
+ df = self._convert_countries(df)
76
+
68
77
  df = self.__convert_features_types(df)
69
78
 
70
79
  return df, self.search_keys, self.generated_features
@@ -191,6 +200,22 @@ class Normalizer:
191
200
  df = converter.convert(df)
192
201
  return df
193
202
 
203
+ def _convert_ip_addresses(self, df: pd.DataFrame) -> pd.DataFrame:
204
+ for ip_col in SearchKey.find_all_keys(self.search_keys, SearchKey.IP):
205
+ df[ip_col] = df[ip_col].apply(IpSearchKeyConverter.safe_ip_parse)
206
+ return df
207
+
208
+ def _convert_postal_codes(self, df: pd.DataFrame) -> pd.DataFrame:
209
+ for postal_code_col in SearchKey.find_all_keys(self.search_keys, SearchKey.POSTAL_CODE):
210
+ df = PostalCodeSearchKeyConverter(postal_code_col).convert(df)
211
+ return df
212
+
213
+ def _convert_countries(self, df: pd.DataFrame) -> pd.DataFrame:
214
+ maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
215
+ if maybe_country_col:
216
+ df = CountrySearchKeyConverter(maybe_country_col).convert(df)
217
+ return df
218
+
194
219
  def __convert_features_types(self, df: pd.DataFrame):
195
220
  # self.logger.info("Convert features to supported data types")
196
221
 
upgini/utils/ip_utils.py CHANGED
@@ -79,7 +79,7 @@ class IpSearchKeyConverter:
79
79
  pass
80
80
 
81
81
  @staticmethod
82
- def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
82
+ def safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address, bytes]) -> Optional[_BaseAddress]:
83
83
  try:
84
84
  return ip_address(ip)
85
85
  except ValueError:
@@ -110,7 +110,7 @@ class IpSearchKeyConverter:
110
110
  self.logger.info("Convert ip address to int")
111
111
  original_ip = self.columns_renaming[self.ip_column]
112
112
 
113
- df[self.ip_column] = df[self.ip_column].apply(self._safe_ip_parse)
113
+ df[self.ip_column] = df[self.ip_column].apply(self.safe_ip_parse)
114
114
  if df[self.ip_column].isnull().all():
115
115
  raise ValidationError(self.bundle.get("invalid_ip").format(self.ip_column))
116
116
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.96a1
3
+ Version: 1.2.96a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=rKXcG2aFlwzxqUbUMNA-wiGYi7lhcfupLKmFdSTOcGU,25
1
+ upgini/__about__.py,sha256=euNfF4usca0xoZtvXArti-DB6GH8wgwXSYVRRbwzMkE,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=C9f8WfPuOXiWwmXt12jTLqtMG8fJvDPEB-2i1EYBlTA,219140
6
+ upgini/features_enricher.py,sha256=dGB4utdlCtgILwX1Hnchg066fwwoyItPBSyzoPe64Z8,218244
7
7
  upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
8
8
  upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
9
9
  upgini/metrics.py,sha256=UbKEsHB7XDzoyGNqDx846zbh1t65GpqdnnhViccdoKU,45615
@@ -35,7 +35,7 @@ upgini/data_source/data_source_publisher.py,sha256=ufL8qK1vg8iUKd5bLWz6hEMGiC3Je
35
35
  upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
36
36
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
37
37
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
38
+ upgini/normalizer/normalize_utils.py,sha256=BSP0vIjRPrupL1sziAudPXJ-qsO4UE9Pyhwiqa1MZV8,8484
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
41
  upgini/resource_bundle/strings.properties,sha256=UO6K0wwvutyOyClOnJYlFYAETzMSen6hHnj3--5AIAs,28497
@@ -59,7 +59,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
59
59
  upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
60
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
61
61
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
62
- upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
62
+ upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
63
63
  upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.96a1.dist-info/METADATA,sha256=B_yjMWUU9i8rRbntPj90HWEprh4z7_SiZFFkL-fPZOM,49530
75
- upgini-1.2.96a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
76
- upgini-1.2.96a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.96a1.dist-info/RECORD,,
74
+ upgini-1.2.96a3.dist-info/METADATA,sha256=HdJ2Ptri3J0BuSdc6tXhBZ79vpGvbKIuPx5PVLrtTeY,49530
75
+ upgini-1.2.96a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
76
+ upgini-1.2.96a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.96a3.dist-info/RECORD,,