upgini 1.2.137__py3-none-any.whl → 1.2.139__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.137"
1
+ __version__ = "1.2.139"
upgini/dataset.py CHANGED
@@ -54,7 +54,7 @@ except Exception:
54
54
 
55
55
  class Dataset:
56
56
  MIN_ROWS_COUNT = 100
57
- MAX_ROWS = 200_000
57
+ MAX_ROWS = 3_000_000
58
58
  MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
59
59
  MAX_FEATURES_COUNT = 3500
60
60
  MAX_UPLOADING_FILE_SIZE = 268435456 # 256 Mb
@@ -77,6 +77,7 @@ from upgini.utils.custom_loss_utils import (
77
77
  )
78
78
  from upgini.utils.cv_utils import CVConfig, get_groups
79
79
  from upgini.utils.datetime_utils import (
80
+ DateSearchKeyDetector,
80
81
  DateTimeConverter,
81
82
  is_blocked_time_series,
82
83
  is_dates_distribution_valid,
@@ -238,6 +239,7 @@ class FeaturesEnricher(TransformerMixin):
238
239
  generate_search_key_features: bool = True,
239
240
  sample_config: SampleConfig | None = None,
240
241
  print_trace_id: bool = False,
242
+ print_loaded_report: bool = True,
241
243
  **kwargs,
242
244
  ):
243
245
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -284,7 +286,7 @@ class FeaturesEnricher(TransformerMixin):
284
286
  self.id_columns = id_columns
285
287
  self.id_columns_encoder = None
286
288
  self.country_code = country_code
287
- self.__validate_search_keys(search_keys, search_id)
289
+ self.__validate_search_keys(self.search_keys, search_id)
288
290
 
289
291
  self.model_task_type = ModelTaskType.parse(model_task_type)
290
292
  self.endpoint = endpoint
@@ -317,7 +319,8 @@ class FeaturesEnricher(TransformerMixin):
317
319
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
318
320
  df = pd.DataFrame(columns=x_columns)
319
321
  self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
320
- self.__show_selected_features()
322
+ if print_loaded_report:
323
+ self.__show_selected_features()
321
324
  # TODO validate search_keys with search_keys from file_metadata
322
325
  print(self.bundle.get("search_by_task_id_finish"))
323
326
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -3226,8 +3229,11 @@ if response.status_code == 200:
3226
3229
  df, self.fit_search_keys, self.fit_generated_features
3227
3230
  )
3228
3231
  self.fit_columns_renaming = normalizer.columns_renaming
3229
- if normalizer.removed_features:
3230
- self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
3232
+ if normalizer.removed_datetime_features:
3233
+ original_removed_datetime_features = [
3234
+ self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
3235
+ ]
3236
+ self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))
3231
3237
 
3232
3238
  non_feature_columns = [
3233
3239
  self.TARGET_NAME,
@@ -4093,11 +4099,12 @@ if response.status_code == 200:
4093
4099
  or set(search_keys.values()) == {SearchKey.EMAIL}
4094
4100
  or set(search_keys.values()) == {SearchKey.HEM}
4095
4101
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
4102
+ or len(search_keys) == 0
4103
+ or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
4096
4104
  ):
4097
4105
  if not silent:
4098
4106
  self.__log_warning(bundle.get("current_date_added"))
4099
4107
  df[CURRENT_DATE_COL] = datetime.date.today()
4100
- # df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
4101
4108
  search_keys[CURRENT_DATE_COL] = SearchKey.DATE
4102
4109
  converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
4103
4110
  df = converter.convert(df)
@@ -4781,7 +4788,8 @@ if response.status_code == 200:
4781
4788
  else:
4782
4789
  msg = self.bundle.get("unregistered_only_personal_keys")
4783
4790
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
4784
- raise ValidationError(msg)
4791
+ # Current date will be added later
4792
+ # raise ValidationError(msg)
4785
4793
 
4786
4794
  if (
4787
4795
  len(valid_search_keys.values()) == 1
@@ -4900,6 +4908,21 @@ if response.status_code == 200:
4900
4908
  search_key in self.fit_search_keys.values() and search_key not in search_keys.values()
4901
4909
  )
4902
4910
 
4911
+ if (
4912
+ SearchKey.DATE not in search_keys.values()
4913
+ and SearchKey.DATETIME not in search_keys.values()
4914
+ and check_need_detect(SearchKey.DATE)
4915
+ and check_need_detect(SearchKey.DATETIME)
4916
+ ):
4917
+ maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4918
+ if len(maybe_keys) > 0:
4919
+ datetime_key = maybe_keys[0]
4920
+ search_keys[datetime_key] = SearchKey.DATETIME
4921
+ self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
4922
+ self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4923
+ if not silent_mode:
4924
+ print(self.bundle.get("datetime_detected").format(datetime_key))
4925
+
4903
4926
  # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
4904
4927
  if check_need_detect(SearchKey.POSTAL_CODE):
4905
4928
  maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
  from pandas.api.types import is_bool_dtype as is_bool
8
- from pandas.api.types import is_datetime64_any_dtype as is_datetime
9
8
  from pandas.api.types import (
10
9
  is_float_dtype,
11
10
  is_numeric_dtype,
@@ -45,7 +44,7 @@ class Normalizer:
45
44
  self.columns_renaming = {}
46
45
  self.search_keys = {}
47
46
  self.generated_features = []
48
- self.removed_features = []
47
+ self.removed_datetime_features = []
49
48
 
50
49
  def normalize(
51
50
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -134,8 +133,9 @@ class Normalizer:
134
133
  features = self._get_features(df)
135
134
 
136
135
  for f in features:
137
- if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
138
- self.removed_features.append(f)
136
+ converter = DateTimeConverter(f)
137
+ if converter.is_datetime(df):
138
+ self.removed_datetime_features.append(f)
139
139
  df.drop(columns=f, inplace=True)
140
140
 
141
141
  return df
@@ -210,6 +210,7 @@ features_info_zero_important_features=Oops, we can't find any relevant external
210
210
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
211
211
  features_not_generated=Following features didn't pass checks for automated feature generation: {}
212
212
  # Information
213
+ datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
213
214
  postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
214
215
  country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
215
216
  country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
24
24
  for column_name in other_columns:
25
25
  if self._is_search_key_by_values(df[column_name]):
26
26
  columns_by_values.append(column_name)
27
- return list(set(columns_by_names + columns_by_values))
27
+
28
+ both = [col for col in columns_by_names if col in columns_by_values]
29
+ only_values = [col for col in columns_by_values if col not in columns_by_names]
30
+ only_names = [col for col in columns_by_names if col not in columns_by_values]
31
+ return both + only_values + only_names
@@ -10,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
10
10
  from upgini.errors import ValidationError
11
11
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
12
12
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
13
+ from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
13
14
 
14
15
  DATE_FORMATS = [
15
16
  "%Y-%m-%d",
@@ -29,6 +30,15 @@ DATE_FORMATS = [
29
30
  DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
30
31
 
31
32
 
33
+ class DateSearchKeyDetector(BaseSearchKeyDetector):
34
+ def _is_search_key_by_name(self, column_name: str) -> bool:
35
+ lower_column_name = str(column_name).lower()
36
+ return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
37
+
38
+ def _is_search_key_by_values(self, column: pd.Series) -> bool:
39
+ return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
40
+
41
+
32
42
  class DateTimeConverter:
33
43
  DATETIME_COL = "_date_time"
34
44
  # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
@@ -80,7 +90,7 @@ class DateTimeConverter:
80
90
  return True
81
91
 
82
92
  parsed = self.parse_datetime(df, raise_errors=False)
83
- return parsed is not None and not parsed.isna().all()
93
+ return parsed is not None and parsed.isna().mean() <= 0.5
84
94
 
85
95
  def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
86
96
  if len(df) == 0 or df[self.date_column].isna().all():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.137
3
+ Version: 1.2.139
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=WAhVusMOc6hw9YR0UCWwVJJi3v2_uHEpPqxnSm9SguM,24
1
+ upgini/__about__.py,sha256=OVX5g3MZ0JOT_utGS3DJjpqUC3UrDWMPf-_HJXfHjfo,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
4
+ upgini/dataset.py,sha256=7qM6O2NnsdnfAVrrPIyHKyUqwlQyuiPsR12ayF0OHsc,33356
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=g1daAwtECDpSsM2TtVmozU3crsdMQ_xC5PlBJAafAX0,236099
6
+ upgini/features_enricher.py,sha256=nddbXsQ27GGcNcS-xl_5IhEk5-7GM7vCxcoRN-kUh2M,237269
7
7
  upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
8
8
  upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -35,10 +35,10 @@ upgini/data_source/data_source_publisher.py,sha256=CQi3fEukaStV-RiadSEvEFLThOlZJ
35
35
  upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
36
36
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
37
37
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- upgini/normalizer/normalize_utils.py,sha256=w9f_9udrwqbhXgFMTs2keuce-6X_j6h3D7EdNo_2X7g,8493
38
+ upgini/normalizer/normalize_utils.py,sha256=oKevieBChYxtugocrbev8Uz2vqSZ_PB4Ibo55fHOylM,8452
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=3aK2sxXYuvSLuoOyLq8IcyekfINH0Il5nLvVXMsuEpY,29353
41
+ upgini/resource_bundle/strings.properties,sha256=Dym1ymRH2uTeb0CfrO_lccoP9LWj7lxNGvsxmR7vkSw,29583
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -46,13 +46,13 @@ upgini/sampler/random_under_sampler.py,sha256=4mofmaRTmNwT_HqxecWJyfXdLKK0h9jMBw
46
46
  upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
47
47
  upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
48
48
  upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
49
- upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
49
+ upgini/utils/base_search_key_detector.py,sha256=DCmTbfhCdirJaDMpbKicFvhgVv6a9NNf9tXy3AbMCEg,1258
50
50
  upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
51
51
  upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
52
52
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
53
53
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
54
54
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
- upgini/utils/datetime_utils.py,sha256=B9zNaH2ZyV-lbBSTBdCZjc4zq1nVlejci40sf-TYfik,17102
55
+ upgini/utils/datetime_utils.py,sha256=pTi5doYjjjlQgwEVDNrJuKaAEC3Jtx78fjX4W7M_UV4,17615
56
56
  upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
57
57
  upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
74
74
  upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.137.dist-info/METADATA,sha256=qTIcTcJz2tn18BOyAa7SbP14-fOxG4l3rtQyCxG8wDI,51164
78
- upgini-1.2.137.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
- upgini-1.2.137.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.137.dist-info/RECORD,,
77
+ upgini-1.2.139.dist-info/METADATA,sha256=LOVD2PiZDgd7Je06bv_AqVQfyhM_NXYvkMTQYq6aBpg,51164
78
+ upgini-1.2.139.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
+ upgini-1.2.139.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.139.dist-info/RECORD,,