upgini 1.1.312a3__py3-none-any.whl → 1.1.312a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.312a3"
1
+ __version__ = "1.1.312a5"
upgini/dataset.py CHANGED
@@ -440,9 +440,11 @@ class Dataset: # (pd.DataFrame):
440
440
  FileColumnMeaningType.DATETIME,
441
441
  # FileColumnMeaningType.IP_ADDRESS,
442
442
  }:
443
+ min_value = self.data[column_name].astype("Int64").min()
444
+ max_value = self.data[column_name].astype("Int64").max()
443
445
  min_max_values = NumericInterval(
444
- minValue=self.data[column_name].astype("Int64").min(),
445
- maxValue=self.data[column_name].astype("Int64").max(),
446
+ minValue=min_value,
447
+ maxValue=max_value,
446
448
  )
447
449
  else:
448
450
  min_max_values = None
@@ -91,7 +91,11 @@ from upgini.utils.display_utils import (
91
91
  prepare_and_show_report,
92
92
  show_request_quote_button,
93
93
  )
94
- from upgini.utils.email_utils import EmailDomainGenerator, EmailSearchKeyConverter, EmailSearchKeyDetector
94
+ from upgini.utils.email_utils import (
95
+ EmailDomainGenerator,
96
+ EmailSearchKeyConverter,
97
+ EmailSearchKeyDetector,
98
+ )
95
99
  from upgini.utils.features_validator import FeaturesValidator
96
100
  from upgini.utils.format import Format
97
101
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -1030,7 +1034,7 @@ class FeaturesEnricher(TransformerMixin):
1030
1034
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1031
1035
  }
1032
1036
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1033
- y_sorted
1037
+ effective_y
1034
1038
  ):
1035
1039
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1036
1040
  np.mean(effective_y), 4
@@ -1103,7 +1107,7 @@ class FeaturesEnricher(TransformerMixin):
1103
1107
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1104
1108
  }
1105
1109
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1106
- eval_y_sorted
1110
+ effective_eval_set[idx][1]
1107
1111
  ):
1108
1112
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1109
1113
  np.mean(effective_eval_set[idx][1]), 4
@@ -1363,6 +1367,7 @@ class FeaturesEnricher(TransformerMixin):
1363
1367
  importance_threshold,
1364
1368
  max_features,
1365
1369
  )
1370
+ filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1366
1371
 
1367
1372
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1368
1373
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
@@ -2217,7 +2222,9 @@ class FeaturesEnricher(TransformerMixin):
2217
2222
  result = enrich()
2218
2223
 
2219
2224
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2220
- existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
2225
+ existing_filtered_columns = [
2226
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2227
+ ]
2221
2228
  selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2222
2229
  if add_fit_system_record_id:
2223
2230
  selecting_columns.append(SORT_ID)
@@ -2430,9 +2437,7 @@ class FeaturesEnricher(TransformerMixin):
2430
2437
 
2431
2438
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2432
2439
  if email_columns:
2433
- generator = EmailDomainGenerator(
2434
- email_columns
2435
- )
2440
+ generator = EmailDomainGenerator(email_columns)
2436
2441
  df = generator.generate(df)
2437
2442
  self.fit_generated_features.extend(generator.generated_features)
2438
2443
 
@@ -1,6 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
+ import pytz
4
5
  from typing import Dict, List, Optional
5
6
 
6
7
  import numpy as np
@@ -28,12 +29,13 @@ DATE_FORMATS = [
28
29
  "%Y-%m-%dT%H:%M:%S.%f",
29
30
  ]
30
31
 
31
- DATETIME_PATTERN = r"^[\d\s\.\-:T/]+$"
32
+ DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
32
33
 
33
34
 
34
35
  class DateTimeSearchKeyConverter:
35
36
  DATETIME_COL = "_date_time"
36
- MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
37
+ # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
38
+ MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
37
39
 
38
40
  def __init__(
39
41
  self,
@@ -106,12 +108,13 @@ class DateTimeSearchKeyConverter:
106
108
  df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
107
109
  df[self.date_column] = self.parse_date(df)
108
110
 
109
- df = self.clean_old_dates(df)
110
-
111
111
  # If column with date is datetime then extract seconds of the day and minute of the hour
112
112
  # as additional features
113
113
  seconds = "datetime_seconds"
114
114
  df[self.date_column] = df[self.date_column].dt.tz_localize(None)
115
+
116
+ df = self.clean_old_dates(df)
117
+
115
118
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
116
119
 
117
120
  seconds_without_na = df[seconds].dropna()
@@ -29,21 +29,21 @@ class PhoneSearchKeyConverter:
29
29
  def convert(self, df: pd.DataFrame) -> pd.DataFrame:
30
30
  df = self.phone_to_int(df)
31
31
  if self.country_column is not None:
32
- df = df.apply(self.add_prefix, axis=1)
32
+ df[self.phone_column] = df.apply(self.add_prefix, axis=1)
33
33
  df[self.phone_column] = df[self.phone_column].astype("Int64")
34
34
  return df
35
35
 
36
36
  def add_prefix(self, row):
37
37
  phone = row[self.phone_column]
38
38
  if pd.isna(phone):
39
- return row
39
+ return phone
40
40
  country = row[self.country_column]
41
41
  country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
42
42
  if country_prefix_tuple is not None:
43
43
  country_prefix, number_of_digits = country_prefix_tuple
44
44
  if len(str(phone)) == number_of_digits:
45
- row[self.phone_column] = int(country_prefix + str(phone))
46
- return row
45
+ return int(country_prefix + str(phone))
46
+ return phone
47
47
 
48
48
  def phone_to_int(self, df: pd.DataFrame) -> pd.DataFrame:
49
49
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.312a3
3
+ Version: 1.1.312a5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=IaDaRN1MPzK9IEvOazFTrqDhFeyxseC5mkVDu1NRrYc,26
1
+ upgini/__about__.py,sha256=AYWzaD5lVHd4s3vopg53TBKzkKgWzJVi62VrmfO7rRU,26
2
2
  upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=CdOE1h94E1YgStslQIPfvMp5z_ODt7QfXfxqpmYL5Xs,30758
4
+ upgini/dataset.py,sha256=c6jghh32P9_2CspELYCOsmNIOiShuCADnCCJ8Jj2t50,30834
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=FfFlFW3BArv2rQWGCs-SXrDDDcjQTwwJxzRysZlJfq0,186961
6
+ upgini/features_enricher.py,sha256=LqGOMObkFsAm58sBL3UhTmc7TOnDQmLivxl3jbXh-n0,187132
7
7
  upgini/http.py,sha256=a4Epc9YLIJBuYk4t8E_2-QDLBtJFqKO35jn2SnYQZCg,42920
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
9
  upgini/metadata.py,sha256=YQ-1HZGyPOksP2iM50ff_pMHXLyzvpChqSfNh8Z0ke4,10833
@@ -42,7 +42,7 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
42
42
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
- upgini/utils/datetime_utils.py,sha256=O-IQbWtWJs6xTAr3m9FMRHyT-fL_28vCMrrt4eqfpa0,12025
45
+ upgini/utils/datetime_utils.py,sha256=JSHCx6kpt7n60i3cphI5yWEatQK729x1coSjC8Gafrg,12135
46
46
  upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
@@ -50,14 +50,14 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
- upgini/utils/phone_utils.py,sha256=c8oNajhT7Z1hXpiRAEH828vX7SoALBJKUun_M5qu9vg,10363
53
+ upgini/utils/phone_utils.py,sha256=PTSRfGAWCuLy8R6I8X6clcc1K7bZXIIrZ_alIB8irC8,10368
54
54
  upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.312a3.dist-info/METADATA,sha256=133zxblkCtyOblc05Qe0mfRcwYYU9qLcPbNNBjHl-mY,48155
61
- upgini-1.1.312a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.312a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.312a3.dist-info/RECORD,,
60
+ upgini-1.1.312a5.dist-info/METADATA,sha256=MdhSWCWMCuajyC7B1bVWfVFOs7b5iIsmD7m3Z48egng,48155
61
+ upgini-1.1.312a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.312a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.312a5.dist-info/RECORD,,