upgini 1.2.38a3769.dev1__py3-none-any.whl → 1.2.38a3769.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.38a3769.dev1"
1
+ __version__ = "1.2.38a3769.dev3"
upgini/dataset.py CHANGED
@@ -312,6 +312,7 @@ class Dataset: # (pd.DataFrame):
312
312
  if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
313
313
  ),
314
314
  sample_size=sample_rows,
315
+ random_state=self.random_state,
315
316
  logger=self.logger,
316
317
  )
317
318
  else:
upgini/features_enricher.py CHANGED
@@ -281,8 +281,6 @@ class FeaturesEnricher(TransformerMixin):
281
281
 
282
282
  self.search_keys = search_keys or {}
283
283
  self.id_columns = id_columns
284
- if id_columns is not None:
285
- self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
286
284
  self.country_code = country_code
287
285
  self.__validate_search_keys(search_keys, search_id)
288
286
 
@@ -2657,6 +2655,9 @@ class FeaturesEnricher(TransformerMixin):
2657
2655
 
2658
2656
  self.__adjust_cv(df)
2659
2657
 
2658
+ if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2659
+ self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
2660
+
2660
2661
  df, fintech_warnings = remove_fintech_duplicates(
2661
2662
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2662
2663
  )
upgini/utils/target_utils.py CHANGED
@@ -246,6 +246,7 @@ def balance_undersample_forced(
246
246
  id_columns=id_columns,
247
247
  date_column=date_column,
248
248
  sample_size=sample_size,
249
+ random_state=random_state,
249
250
  logger=logger,
250
251
  )
251
252
  elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
@@ -284,19 +285,28 @@ def balance_undersample_time_series(
284
285
  id_columns: List[str],
285
286
  date_column: str,
286
287
  sample_size: int,
288
+ random_state: int = 42,
287
289
  min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
290
+ prefer_recent_dates: bool = True,
288
291
  logger: Optional[logging.Logger] = None,
289
292
  ):
290
293
  def ensure_tuple(x):
291
294
  return tuple([x]) if not isinstance(x, tuple) else x
292
295
 
296
+ random_state = np.random.RandomState(random_state)
297
+
293
298
  ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
294
- ids_sort = {ensure_tuple(k): (v["max"], v["count"]) for k, v in ids_sort.items()}
299
+ ids_sort = {
300
+ ensure_tuple(k): (
301
+ (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
302
+ )
303
+ for k, v in ids_sort.items()
304
+ }
295
305
  id_counts = df[id_columns].value_counts()
296
306
  id_counts.index = [ensure_tuple(i) for i in id_counts.index]
297
307
  id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
298
308
  id_counts = id_counts[id_counts <= sample_size]
299
- min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
309
+ min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
300
310
 
301
311
  def id_mask(sample_index: pd.Index) -> pd.Index:
302
312
  if isinstance(sample_index, pd.MultiIndex):
@@ -307,10 +317,10 @@ def balance_undersample_time_series(
307
317
  if len(id_counts) < min_different_ids:
308
318
  if logger is not None:
309
319
  logger.info(
310
- f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
320
+ f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
311
321
  )
312
322
  date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
313
- ids_to_sample = date_counts.index[:min_different_ids]
323
+ ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
314
324
  mask = id_mask(ids_to_sample)
315
325
  df = df[mask]
316
326
  sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.38a3769.dev1
3
+ Version: 1.2.38a3769.dev3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=p0xaY3SHrNu5ANUCNBeoBbJ2dD9QsJL_eb_HjEWLp7Q,33
1
+ upgini/__about__.py,sha256=sQ7NNr0lfG3UfxCnX2sMNRntUVR0zW-NHhIgizLV7ls,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=zJQUzCTcSV5bqZ9B0oy2a77-oigLmW9F8BGs23WYwA0,33109
4
+ upgini/dataset.py,sha256=zYPSQ73ch6k5EWxZlh1KrjL0gMkmAwl7Nkgrz6zxywY,33161
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=NQuaXJTKf-CR6fM9fGrAjxYMxcoxGPO-YPvyHDRDfag,195477
6
+ upgini/features_enricher.py,sha256=m7z3iWSEj0ORUVnp65I0b_427SITjNnBvn8hdebS_xE,195541
7
7
  upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
@@ -56,10 +56,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
56
56
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
57
57
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
58
58
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
59
- upgini/utils/target_utils.py,sha256=9LWG8LiCzgYD1h3_MvOFnN3BG8bMLnwfCWdRV47cs_I,13910
59
+ upgini/utils/target_utils.py,sha256=i_EsluRZG3LKrqv9NmhvEha9Uwp8JQjRUmokeo240Is,14283
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.38a3769.dev1.dist-info/METADATA,sha256=xECfr7DVtLllQD_hQft1lzZVdFAXB1uMjGK_BkNXdLc,48604
63
- upgini-1.2.38a3769.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
- upgini-1.2.38a3769.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.38a3769.dev1.dist-info/RECORD,,
62
+ upgini-1.2.38a3769.dev3.dist-info/METADATA,sha256=AeaVPfRIc-RCuzozwXSgurTpHXE21yR_tpsBjCra3KA,48604
63
+ upgini-1.2.38a3769.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.38a3769.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.38a3769.dev3.dist-info/RECORD,,