PyPI - upgini - Versions diffs - 1.2.38a3769.dev1__py3-none-any.whl → 1.2.38a3769.dev3__py3-none-any.whl - Mend

upgini 1.2.38a3769.dev1__py3-none-any.whl → 1.2.38a3769.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (8) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.38a3769.dev1"
1	+ __version__ = "1.2.38a3769.dev3"

upgini/dataset.py CHANGED Viewed

@@ -312,6 +312,7 @@ class Dataset:  # (pd.DataFrame):
                         if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
                     ),
                     sample_size=sample_rows,
+                    random_state=self.random_state,
                     logger=self.logger,
                 )
             else:

upgini/features_enricher.py CHANGED Viewed

@@ -281,8 +281,6 @@ class FeaturesEnricher(TransformerMixin):
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
-        if id_columns is not None:
-            self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
@@ -2657,6 +2655,9 @@ class FeaturesEnricher(TransformerMixin):
         self.__adjust_cv(df)
+        if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
+            self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
         df, fintech_warnings = remove_fintech_duplicates(
             df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
         )

upgini/utils/target_utils.py CHANGED Viewed

@@ -246,6 +246,7 @@ def balance_undersample_forced(
             id_columns=id_columns,
             date_column=date_column,
             sample_size=sample_size,
+            random_state=random_state,
             logger=logger,
         )
     elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
@@ -284,19 +285,28 @@ def balance_undersample_time_series(
     id_columns: List[str],
     date_column: str,
     sample_size: int,
+    random_state: int = 42,
     min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
+    prefer_recent_dates: bool = True,
     logger: Optional[logging.Logger] = None,
 ):
     def ensure_tuple(x):
         return tuple([x]) if not isinstance(x, tuple) else x
+    random_state = np.random.RandomState(random_state)
     ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
-    ids_sort = {ensure_tuple(k): (v["max"], v["count"]) for k, v in ids_sort.items()}
+    ids_sort = {
+        ensure_tuple(k): (
+            (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
+        )
+        for k, v in ids_sort.items()
+    }
     id_counts = df[id_columns].value_counts()
     id_counts.index = [ensure_tuple(i) for i in id_counts.index]
     id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
     id_counts = id_counts[id_counts <= sample_size]
-    min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
+    min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
     def id_mask(sample_index: pd.Index) -> pd.Index:
         if isinstance(sample_index, pd.MultiIndex):
@@ -307,10 +317,10 @@ def balance_undersample_time_series(
     if len(id_counts) < min_different_ids:
         if logger is not None:
             logger.info(
-                f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
+                f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
             )
         date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
-        ids_to_sample = date_counts.index[:min_different_ids]
+        ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
         mask = id_mask(ids_to_sample)
         df = df[mask]
         sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()

{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.38a3769.dev1
+Version: 1.2.38a3769.dev3
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=p0xaY3SHrNu5ANUCNBeoBbJ2dD9QsJL_eb_HjEWLp7Q,33
+upgini/__about__.py,sha256=sQ7NNr0lfG3UfxCnX2sMNRntUVR0zW-NHhIgizLV7ls,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=zJQUzCTcSV5bqZ9B0oy2a77-oigLmW9F8BGs23WYwA0,33109
+upgini/dataset.py,sha256=zYPSQ73ch6k5EWxZlh1KrjL0gMkmAwl7Nkgrz6zxywY,33161
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=NQuaXJTKf-CR6fM9fGrAjxYMxcoxGPO-YPvyHDRDfag,195477
+upgini/features_enricher.py,sha256=m7z3iWSEj0ORUVnp65I0b_427SITjNnBvn8hdebS_xE,195541
 upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
@@ -56,10 +56,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/target_utils.py,sha256=9LWG8LiCzgYD1h3_MvOFnN3BG8bMLnwfCWdRV47cs_I,13910
+upgini/utils/target_utils.py,sha256=i_EsluRZG3LKrqv9NmhvEha9Uwp8JQjRUmokeo240Is,14283
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.38a3769.dev1.dist-info/METADATA,sha256=xECfr7DVtLllQD_hQft1lzZVdFAXB1uMjGK_BkNXdLc,48604
-upgini-1.2.38a3769.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.38a3769.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.38a3769.dev1.dist-info/RECORD,,
+upgini-1.2.38a3769.dev3.dist-info/METADATA,sha256=AeaVPfRIc-RCuzozwXSgurTpHXE21yR_tpsBjCra3KA,48604
+upgini-1.2.38a3769.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.38a3769.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.38a3769.dev3.dist-info/RECORD,,

{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.38a3769.dev1.dist-info → upgini-1.2.38a3769.dev3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.38a3769.dev1__py3-none-any.whl → 1.2.38a3769.dev3__py3-none-any.whl

Potentially problematic release.

upgini 1.2.38a3769.dev1__py3-none-any.whl → 1.2.38a3769.dev3__py3-none-any.whl