upgini 1.1.306__py3-none-any.whl → 1.1.308__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.306"
1
+ __version__ = "1.1.308"
upgini/__init__.py CHANGED
@@ -1,7 +1,11 @@
1
+ import os
2
+
1
3
  from .lazy_import import LazyImport
2
4
 
3
- FeaturesEnricher = LazyImport('upgini.features_enricher', 'FeaturesEnricher')
4
- SearchKey = LazyImport('upgini.metadata', 'SearchKey')
5
- RuntimeParameters = LazyImport('upgini.metadata', 'RuntimeParameters')
6
- CVType = LazyImport('upgini.metadata', 'CVType')
7
- ModelTaskType = LazyImport('upgini.metadata', 'ModelTaskType')
5
+ os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
6
+
7
+ FeaturesEnricher = LazyImport("upgini.features_enricher", "FeaturesEnricher")
8
+ SearchKey = LazyImport("upgini.metadata", "SearchKey")
9
+ RuntimeParameters = LazyImport("upgini.metadata", "RuntimeParameters")
10
+ CVType = LazyImport("upgini.metadata", "CVType")
11
+ ModelTaskType = LazyImport("upgini.metadata", "ModelTaskType")
@@ -90,7 +90,6 @@ from upgini.utils.display_utils import (
90
90
  from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
91
91
  from upgini.utils.features_validator import FeaturesValidator
92
92
  from upgini.utils.format import Format
93
- from upgini.utils.ip_utils import IpToCountrySearchKeyConverter
94
93
  from upgini.utils.phone_utils import PhoneSearchKeyDetector
95
94
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
96
95
 
@@ -1213,13 +1212,6 @@ class FeaturesEnricher(TransformerMixin):
1213
1212
  converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1214
1213
  extended_X = converter.convert(extended_X)
1215
1214
  generated_features.extend(converter.generated_features)
1216
- if (
1217
- self.detect_missing_search_keys
1218
- and list(search_keys.values()) == [SearchKey.DATE]
1219
- and self.country_code is None
1220
- ):
1221
- converter = IpToCountrySearchKeyConverter(search_keys, self.logger)
1222
- extended_X = converter.convert(extended_X)
1223
1215
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1224
1216
 
1225
1217
  return extended_X, search_keys
@@ -1987,13 +1979,6 @@ class FeaturesEnricher(TransformerMixin):
1987
1979
  df = converter.convert(df)
1988
1980
  generated_features.extend(converter.generated_features)
1989
1981
  email_converted_to_hem = converter.email_converted_to_hem
1990
- if (
1991
- self.detect_missing_search_keys
1992
- and list(search_keys.values()) == [SearchKey.DATE]
1993
- and self.country_code is None
1994
- ):
1995
- converter = IpToCountrySearchKeyConverter(search_keys, self.logger)
1996
- df = converter.convert(df)
1997
1982
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1998
1983
 
1999
1984
  meaning_types = {col: key.value for col, key in search_keys.items()}
@@ -2300,8 +2285,6 @@ class FeaturesEnricher(TransformerMixin):
2300
2285
  self.fit_search_keys = self.search_keys.copy()
2301
2286
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2302
2287
 
2303
- validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2304
-
2305
2288
  maybe_date_column = self._get_date_column(self.fit_search_keys)
2306
2289
  has_date = maybe_date_column is not None
2307
2290
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
@@ -2322,9 +2305,6 @@ class FeaturesEnricher(TransformerMixin):
2322
2305
 
2323
2306
  df = self.__handle_index_search_keys(df, self.fit_search_keys)
2324
2307
 
2325
- if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2326
- self._validate_PSI(df.sort_values(by=maybe_date_column))
2327
-
2328
2308
  if DEFAULT_INDEX in df.columns:
2329
2309
  msg = self.bundle.get("unsupported_index_column")
2330
2310
  self.logger.info(msg)
@@ -2334,33 +2314,32 @@ class FeaturesEnricher(TransformerMixin):
2334
2314
 
2335
2315
  df = self.__add_country_code(df, self.fit_search_keys)
2336
2316
 
2337
- df = remove_fintech_duplicates(
2338
- df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2339
- )
2340
- df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2341
-
2342
- date_column = self._get_date_column(self.fit_search_keys)
2343
- self.__adjust_cv(df, date_column, model_task_type)
2344
-
2345
2317
  self.fit_generated_features = []
2346
2318
 
2347
- if date_column is not None:
2348
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2319
+ if has_date:
2320
+ converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2349
2321
  df = converter.convert(df, keep_time=True)
2350
- self.logger.info(f"Date column after convertion: {df[date_column]}")
2322
+ self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2351
2323
  self.fit_generated_features.extend(converter.generated_features)
2352
2324
  else:
2353
2325
  self.logger.info("Input dataset hasn't date column")
2354
2326
  if self.add_date_if_missing:
2355
2327
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2356
2328
 
2357
- if (
2358
- self.detect_missing_search_keys
2359
- and list(self.fit_search_keys.values()) == [SearchKey.DATE]
2360
- and self.country_code is None
2361
- ):
2362
- converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2363
- df = converter.convert(df)
2329
+ # Checks that need validated date
2330
+ validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2331
+
2332
+ if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2333
+ self._validate_PSI(df.sort_values(by=maybe_date_column))
2334
+
2335
+ self.__adjust_cv(df, maybe_date_column, model_task_type)
2336
+
2337
+ # TODO normalize and convert all columns
2338
+
2339
+ df = remove_fintech_duplicates(
2340
+ df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2341
+ )
2342
+ df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2364
2343
 
2365
2344
  # Explode multiple search keys
2366
2345
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.306
3
+ Version: 1.1.308
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=okL1wSisFFnkKhJ4nbM09p9rNtG-hKV5SxZSxyhmBQk,24
2
- upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
1
+ upgini/__about__.py,sha256=o7HYGU6jJ9PhGLTF-smMJX3fn5iDsS3gczNF7jp5774,24
2
+ upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=8grYkWdix0NogYAY7YjUVe2Tqfra6NAxM78N9MlLF30,183572
6
+ upgini/features_enricher.py,sha256=OhVgWC42fyTSsvcGVxFXVHQs1xz8ui3tbC9oCcxEV9w,182589
7
7
  upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
9
  upgini/metadata.py,sha256=E5WWZ_MkjGyYNQh_LnwMIBHyqPx1fxk-qhEfQIJnzq8,10209
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.306.dist-info/METADATA,sha256=GUAb3dq_ebpqOqGoH4gEydc7T1Z91cLIur9nMvmrUEg,48153
61
- upgini-1.1.306.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.306.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.306.dist-info/RECORD,,
60
+ upgini-1.1.308.dist-info/METADATA,sha256=-DLyRMOoFbhsZXB53n66ORDp89WAfzYFbC0KtE6FNFk,48153
61
+ upgini-1.1.308.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.308.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.308.dist-info/RECORD,,