upgini 1.1.264a1__py3-none-any.whl → 1.1.265__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -70,6 +70,7 @@ from upgini.utils.datetime_utils import (
70
70
  DateTimeSearchKeyConverter,
71
71
  is_blocked_time_series,
72
72
  is_time_series,
73
+ validate_dates_distribution,
73
74
  )
74
75
  from upgini.utils.deduplicate_utils import (
75
76
  clean_full_duplicates,
@@ -1922,7 +1923,7 @@ class FeaturesEnricher(TransformerMixin):
1922
1923
 
1923
1924
  meaning_types = {col: key.value for col, key in search_keys.items()}
1924
1925
  non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1925
- # Don't pass
1926
+
1926
1927
  if email_converted_to_hem:
1927
1928
  non_keys_columns.append(email_column)
1928
1929
 
@@ -2221,6 +2222,10 @@ class FeaturesEnricher(TransformerMixin):
2221
2222
  self.fit_search_keys = self.search_keys.copy()
2222
2223
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2223
2224
 
2225
+ validate_dates_distribution(
2226
+ validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2227
+ )
2228
+
2224
2229
  has_date = self._get_date_column(self.fit_search_keys) is not None
2225
2230
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2226
2231
  self._validate_binary_observations(validated_y, model_task_type)
@@ -111,6 +111,7 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
114
115
  # eval set validation
115
116
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
117
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import List, Optional
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
+ from upgini.metadata import SearchKey
12
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
+ from upgini.utils.warning_counter import WarningCounter
13
15
 
14
16
  DATE_FORMATS = [
15
17
  "%Y-%m-%d",
@@ -225,3 +227,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
225
227
 
226
228
  is_diff_less_than_two_columns = grouped.apply(check_differences)
227
229
  return is_diff_less_than_two_columns.all()
230
+
231
+
232
def validate_dates_distribution(
    X: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
):
    """Warn when the number of rows per date differs between the two halves of the date range.

    The date column is the last key in ``search_keys`` typed as ``SearchKey.DATE``
    or ``SearchKey.DATETIME``. When no such key exists, the first non-key,
    non-numeric column of ``X`` that ``pd.to_datetime`` can parse is used instead.
    Rows are counted per calendar date, the distinct dates are sorted and split
    into an earlier and a later half, and the ratio of the mean counts
    (later / earlier) is computed. A ratio outside ``[0.8, 1.2]`` triggers the
    ``x_unstable_by_date`` message, which is printed and logged as a warning.

    This check is advisory: it returns ``None`` and does not raise when no usable
    date column is found or when the column cannot be parsed.

    Args:
        X: Training features.
        search_keys: Mapping of column name to search key type.
        logger: Logger for the warning. When omitted, the ``"muted_logger"``
            logger is used with its level set to ``FATAL``.
        bundle: Source of the message text; ``get_custom_bundle()`` is used when omitted.
        warning_counter: Incremented once when the warning fires, if provided.
    """
    date_col = None
    for key, key_type in search_keys.items():
        if key_type in (SearchKey.DATE, SearchKey.DATETIME):
            # No break: with several date keys the last one wins.
            date_col = key

    dates = None
    if date_col is not None:
        dates = _try_parse_dates(X, date_col)
        if dates is None and logger is not None:
            logger.warning(f"Failed to parse dates from column {date_col}, skipping dates distribution check")
    else:
        for col in X.columns:
            if col in search_keys:
                continue
            # Numeric columns are skipped: pd.to_datetime accepts any number as an
            # epoch offset, so an ordinary numeric feature would be mistaken for a
            # date column and mask a real one that appears later in the frame.
            dates = _try_parse_dates(X, col, skip_numeric=True)
            if dates is not None:
                break

    if dates is None:
        return

    date_counts = dates.value_counts().sort_index()
    if len(date_counts) < 2:
        # With fewer than two distinct dates there is nothing to compare.
        return

    half = round(len(date_counts) / 2)
    ratio = date_counts.iloc[half:].mean() / date_counts.iloc[:half].mean()

    if ratio > 1.2 or ratio < 0.8:
        if warning_counter is not None:
            warning_counter.increment()
        if logger is None:
            logger = logging.getLogger("muted_logger")
            logger.setLevel("FATAL")
        bundle = bundle or get_custom_bundle()
        msg = bundle.get("x_unstable_by_date")
        print(msg)
        logger.warning(msg)


def _try_parse_dates(X: pd.DataFrame, col, skip_numeric: bool = False) -> Optional[pd.Series]:
    """Return the calendar dates of ``X[col]``, or ``None`` if it cannot be read as dates."""
    try:
        column = X[col]
        if skip_numeric and is_numeric_dtype(column):
            return None
        return pd.to_datetime(column).dt.date
    except Exception:
        # Best-effort probe: any failure simply means "not a usable date column".
        return None
@@ -132,9 +132,7 @@ def balance_undersample(
132
132
  class_value = classes[class_idx]
133
133
  class_count = vc[class_value]
134
134
  sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
135
- sampler = RandomUnderSampler(
136
- sampling_strategy=sample_strategy, random_state=random_state
137
- )
135
+ sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
138
136
  X = df[SYSTEM_RECORD_ID]
139
137
  X = X.to_frame(SYSTEM_RECORD_ID)
140
138
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
@@ -153,9 +151,7 @@ def balance_undersample(
153
151
  minority_class = df[df[target_column] == min_class_value]
154
152
  majority_class = df[df[target_column] != min_class_value]
155
153
  sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
156
- sampled_majority_class = majority_class.sample(
157
- n=sample_size, random_state=random_state
158
- )
154
+ sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
155
  resampled_data = df[
160
156
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
161
157
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.264a1
3
+ Version: 1.1.265
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=-RtPvppDLzAVXqak2FHBJQdvzPhk1EHskM__vcZoGEE,172730
5
+ upgini/features_enricher.py,sha256=5rc9vcsCBwmRDb8aAPOFGmkRbC7_zGJGPlaSvkytqCk,172880
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
@@ -29,7 +29,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
29
29
  upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
30
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
31
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
32
- upgini/resource_bundle/strings.properties,sha256=AgAzMSq-_vt_uIALOlB_xPVfcQuYYWxGvrOm8lN1CkY,25522
32
+ upgini/resource_bundle/strings.properties,sha256=_bEfgRl2a9sgoy2RxvIf26NemnCW5CM-1AWWpljwZQE,25664
33
33
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
34
34
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -41,7 +41,7 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
41
41
  upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
42
42
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
43
43
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
44
- upgini/utils/datetime_utils.py,sha256=ol5Bgh98wU6KBY9z4QskNO0ja-L7HJL70HmTAjl7iRU,8836
44
+ upgini/utils/datetime_utils.py,sha256=4ii5WphAHlb_NRmdJx35VZpTarJbAr-AnDw3XSzUSow,10346
45
45
  upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
46
46
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
47
47
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
@@ -53,11 +53,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
53
53
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
54
54
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
55
55
  upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
56
- upgini/utils/target_utils.py,sha256=WVhhxpQVvnhsDV7ctlds51VFg7hz59S_MFUSoRZFszw,7204
56
+ upgini/utils/target_utils.py,sha256=5BHcOsBRb4z7P8t3e9rsdXUWUUI7DBmQMmv-x6RwzHM,7152
57
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
58
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
59
- upgini-1.1.264a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
- upgini-1.1.264a1.dist-info/METADATA,sha256=2t2VzbA1zrEr428Dxp5FqaiSL1zthQIERsEwljj46eA,48158
61
- upgini-1.1.264a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
- upgini-1.1.264a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
- upgini-1.1.264a1.dist-info/RECORD,,
59
+ upgini-1.1.265.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.265.dist-info/METADATA,sha256=HX-CwFFNgXRRuZ00TELhLI1-3ufrny1K0uZc9p0JWdA,48156
61
+ upgini-1.1.265.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.265.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.265.dist-info/RECORD,,