upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/PKG-INFO +31 -17
  2. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/README.md +30 -16
  3. upgini-1.2.114a2/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/dataset.py +40 -6
  5. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/features_enricher.py +489 -147
  6. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/metadata.py +1 -0
  7. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/metrics.py +6 -2
  8. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings.properties +6 -1
  9. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/base.py +3 -1
  10. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/random_under_sampler.py +18 -8
  11. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/deduplicate_utils.py +57 -9
  12. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/feature_info.py +5 -0
  13. upgini-1.2.114a2/src/upgini/utils/psi.py +294 -0
  14. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sample_utils.py +30 -2
  15. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/target_utils.py +6 -1
  16. upgini-1.2.113a3974.dev2/src/upgini/__about__.py +0 -1
  17. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/.gitignore +0 -0
  18. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/LICENSE +0 -0
  19. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/pyproject.toml +0 -0
  20. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/__init__.py +0 -0
  21. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads.py +0 -0
  22. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads_management/__init__.py +0 -0
  23. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads_management/ads_manager.py +0 -0
  24. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/__init__.py +0 -0
  25. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/all_operators.py +0 -0
  26. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/binary.py +0 -0
  27. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/date.py +0 -0
  28. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/feature.py +0 -0
  29. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/groupby.py +0 -0
  30. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/operator.py +0 -0
  31. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/__init__.py +0 -0
  32. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/base.py +0 -0
  33. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/cross.py +0 -0
  34. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/delta.py +0 -0
  35. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/lag.py +0 -0
  36. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/roll.py +0 -0
  37. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/trend.py +0 -0
  38. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/volatility.py +0 -0
  39. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/unary.py +0 -0
  40. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/utils.py +0 -0
  41. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/vector.py +0 -0
  42. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/data_source/__init__.py +0 -0
  43. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  44. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/errors.py +0 -0
  45. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/http.py +0 -0
  46. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/mdc/__init__.py +0 -0
  47. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/mdc/context.py +0 -0
  48. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/normalizer/__init__.py +0 -0
  49. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  50. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/__init__.py +0 -0
  51. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  52. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  53. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/__init__.py +0 -0
  54. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/utils.py +0 -0
  55. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/search_task.py +0 -0
  56. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/spinner.py +0 -0
  57. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  58. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/__init__.py +0 -0
  59. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  60. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/blocked_time_series.py +0 -0
  61. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/country_utils.py +0 -0
  62. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  63. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/cv_utils.py +0 -0
  64. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/datetime_utils.py +0 -0
  65. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/display_utils.py +0 -0
  66. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/email_utils.py +0 -0
  67. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  68. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/features_validator.py +0 -0
  69. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/format.py +0 -0
  70. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sklearn_ext.py +0 -0
  76. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sort.py +0 -0
  77. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/track_info.py +0 -0
  78. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/ts_utils.py +0 -0
  79. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/warning_counter.py +0 -0
  80. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.113a3974.dev2
3
+ Version: 1.2.114a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
270
270
  enricher = FeaturesEnricher(
271
271
  search_keys={
272
272
  "subscription_activation_date": SearchKey.DATE,
273
- "country": SearchKey.COUNTRY,
274
- "zip_code": SearchKey.POSTAL_CODE,
275
- "hashed_email": SearchKey.HEM,
273
+ "country": SearchKey.COUNTRY,
274
+ "zip_code": SearchKey.POSTAL_CODE,
275
+ "hashed_email": SearchKey.HEM,
276
276
  "last_visit_ip_address": SearchKey.IP,
277
277
  "registered_with_phone": SearchKey.PHONE
278
278
  })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
358
358
  enricher = FeaturesEnricher(
359
359
  search_keys={
360
360
  "subscription_activation_date": SearchKey.DATE,
361
- "country": SearchKey.COUNTRY,
362
- "zip_code": SearchKey.POSTAL_CODE,
363
- "hashed_email": SearchKey.HEM,
361
+ "country": SearchKey.COUNTRY,
362
+ "zip_code": SearchKey.POSTAL_CODE,
363
+ "hashed_email": SearchKey.HEM,
364
364
  "last_visit_ip_address": SearchKey.IP,
365
365
  "registered_with_phone": SearchKey.PHONE
366
366
  },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
381
381
  enricher = FeaturesEnricher(
382
382
  search_keys={
383
383
  "subscription_activation_date": SearchKey.DATE,
384
- "zip_code": SearchKey.POSTAL_CODE,
384
+ "zip_code": SearchKey.POSTAL_CODE,
385
385
  },
386
386
  country_code = "US",
387
387
  date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
409
409
  enricher = FeaturesEnricher(
410
410
  search_keys={
411
411
  "subscription_activation_date": SearchKey.DATE,
412
- "country": SearchKey.COUNTRY,
413
- "zip_code": SearchKey.POSTAL_CODE
412
+ "country": SearchKey.COUNTRY,
413
+ "zip_code": SearchKey.POSTAL_CODE
414
414
  })
415
415
 
416
416
  # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
464
464
  enricher = FeaturesEnricher(
465
465
  search_keys={
466
466
  "subscription_activation_date": SearchKey.DATE,
467
- "country": SearchKey.COUNTRY,
468
- "zip_code": SearchKey.POSTAL_CODE,
467
+ "country": SearchKey.COUNTRY,
468
+ "zip_code": SearchKey.POSTAL_CODE,
469
469
  },
470
470
  )
471
471
  ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
516
516
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
517
517
  ```python
518
518
  enricher = FeaturesEnricher(
519
- search_keys={
520
- "sales_date": SearchKey.DATE,
519
+ search_keys={
520
+ "sales_date": SearchKey.DATE,
521
521
  },
522
522
  id_columns=["store_id", "product_id"],
523
523
  cv=CVType.time_series
@@ -733,9 +733,22 @@ enricher.fit(
733
733
  )
734
734
  ```
735
735
  #### ⚠️ Requirements for out-of-time dataset
736
- - Same data schema as for search initialization dataset
736
+ - Same data schema as for search initialization X dataset
737
737
  - Pandas dataframe representation
738
738
 
739
+ There are 3 options to pass an out-of-time dataset without labels:
740
+ ```python
741
+ enricher.fit(
742
+ train_ids_and_features,
743
+ train_label,
744
+ eval_set = [
745
+ (eval_ids_and_features_1,), # Just tuple of 1 element
746
+ (eval_ids_and_features_2, None), # None as labels
747
+ (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
748
+ ]
749
+ )
750
+ ```
751
+
739
752
  ### Use custom loss function in feature selection & metrics calculation
740
753
 
741
754
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -797,7 +810,7 @@ enricher = FeaturesEnricher(
797
810
  enricher.fit(X, y)
798
811
  ```
799
812
 
800
- ## Turn off removing of target outliers
813
+ ### Turn off removing of target outliers
801
814
  Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in the fit, fit_transform or calculate_metrics methods:
802
815
 
803
816
  ```python
@@ -808,7 +821,7 @@ enricher = FeaturesEnricher(
808
821
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
809
822
  ```
810
823
 
811
- ## Turn off generating features on search keys
824
+ ### Turn off generating features on search keys
812
825
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
813
826
 
814
827
  ```python
@@ -816,6 +829,7 @@ enricher = FeaturesEnricher(
816
829
  search_keys={"date": SearchKey.DATE},
817
830
  generate_search_key_features=False,
818
831
  )
832
+ ```
819
833
 
820
834
  ## 🔑 Open up all capabilities of Upgini
821
835
 
@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
224
224
  enricher = FeaturesEnricher(
225
225
  search_keys={
226
226
  "subscription_activation_date": SearchKey.DATE,
227
- "country": SearchKey.COUNTRY,
228
- "zip_code": SearchKey.POSTAL_CODE,
229
- "hashed_email": SearchKey.HEM,
227
+ "country": SearchKey.COUNTRY,
228
+ "zip_code": SearchKey.POSTAL_CODE,
229
+ "hashed_email": SearchKey.HEM,
230
230
  "last_visit_ip_address": SearchKey.IP,
231
231
  "registered_with_phone": SearchKey.PHONE
232
232
  })
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
312
312
  enricher = FeaturesEnricher(
313
313
  search_keys={
314
314
  "subscription_activation_date": SearchKey.DATE,
315
- "country": SearchKey.COUNTRY,
316
- "zip_code": SearchKey.POSTAL_CODE,
317
- "hashed_email": SearchKey.HEM,
315
+ "country": SearchKey.COUNTRY,
316
+ "zip_code": SearchKey.POSTAL_CODE,
317
+ "hashed_email": SearchKey.HEM,
318
318
  "last_visit_ip_address": SearchKey.IP,
319
319
  "registered_with_phone": SearchKey.PHONE
320
320
  },
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
335
335
  enricher = FeaturesEnricher(
336
336
  search_keys={
337
337
  "subscription_activation_date": SearchKey.DATE,
338
- "zip_code": SearchKey.POSTAL_CODE,
338
+ "zip_code": SearchKey.POSTAL_CODE,
339
339
  },
340
340
  country_code = "US",
341
341
  date_format = "%Y-%d-%m"
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
363
363
  enricher = FeaturesEnricher(
364
364
  search_keys={
365
365
  "subscription_activation_date": SearchKey.DATE,
366
- "country": SearchKey.COUNTRY,
367
- "zip_code": SearchKey.POSTAL_CODE
366
+ "country": SearchKey.COUNTRY,
367
+ "zip_code": SearchKey.POSTAL_CODE
368
368
  })
369
369
 
370
370
  # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
418
418
  enricher = FeaturesEnricher(
419
419
  search_keys={
420
420
  "subscription_activation_date": SearchKey.DATE,
421
- "country": SearchKey.COUNTRY,
422
- "zip_code": SearchKey.POSTAL_CODE,
421
+ "country": SearchKey.COUNTRY,
422
+ "zip_code": SearchKey.POSTAL_CODE,
423
423
  },
424
424
  )
425
425
  ```
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
470
470
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
471
471
  ```python
472
472
  enricher = FeaturesEnricher(
473
- search_keys={
474
- "sales_date": SearchKey.DATE,
473
+ search_keys={
474
+ "sales_date": SearchKey.DATE,
475
475
  },
476
476
  id_columns=["store_id", "product_id"],
477
477
  cv=CVType.time_series
@@ -687,9 +687,22 @@ enricher.fit(
687
687
  )
688
688
  ```
689
689
  #### ⚠️ Requirements for out-of-time dataset
690
- - Same data schema as for search initialization dataset
690
+ - Same data schema as for search initialization X dataset
691
691
  - Pandas dataframe representation
692
692
 
693
+ There are 3 options to pass an out-of-time dataset without labels:
694
+ ```python
695
+ enricher.fit(
696
+ train_ids_and_features,
697
+ train_label,
698
+ eval_set = [
699
+ (eval_ids_and_features_1,), # Just tuple of 1 element
700
+ (eval_ids_and_features_2, None), # None as labels
701
+ (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
702
+ ]
703
+ )
704
+ ```
705
+
693
706
  ### Use custom loss function in feature selection & metrics calculation
694
707
 
695
708
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -751,7 +764,7 @@ enricher = FeaturesEnricher(
751
764
  enricher.fit(X, y)
752
765
  ```
753
766
 
754
- ## Turn off removing of target outliers
767
+ ### Turn off removing of target outliers
755
768
  Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in the fit, fit_transform or calculate_metrics methods:
756
769
 
757
770
  ```python
@@ -762,7 +775,7 @@ enricher = FeaturesEnricher(
762
775
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
763
776
  ```
764
777
 
765
- ## Turn off generating features on search keys
778
+ ### Turn off generating features on search keys
766
779
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
767
780
 
768
781
  ```python
@@ -770,6 +783,7 @@ enricher = FeaturesEnricher(
770
783
  search_keys={"date": SearchKey.DATE},
771
784
  generate_search_key_features=False,
772
785
  )
786
+ ```
773
787
 
774
788
  ## 🔑 Open up all capabilities of Upgini
775
789
 
@@ -0,0 +1 @@
1
+ __version__ = "1.2.114a2"
@@ -50,7 +50,7 @@ except Exception:
50
50
 
51
51
  class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
- MAX_ROWS = 100_000
53
+ MAX_ROWS = 200_000
54
54
  IMBALANCE_THESHOLD = 0.6
55
55
  MIN_TARGET_CLASS_ROWS = 100
56
56
  MAX_MULTICLASS_CLASS_COUNT = 100
@@ -184,7 +184,19 @@ class Dataset:
184
184
  def __validate_target(self):
185
185
  # self.logger.info("Validating target")
186
186
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
187
- target = self.data[target_column]
187
+
188
+ oot_indices = []
189
+ if EVAL_SET_INDEX in self.data.columns:
190
+ for eval_set_index in self.data[EVAL_SET_INDEX].unique():
191
+ eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
192
+ if eval_set[target_column].isna().all():
193
+ oot_indices.append(eval_set_index)
194
+
195
+ df_to_check = self.data.copy()
196
+ if oot_indices:
197
+ df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
198
+
199
+ target = df_to_check[target_column]
188
200
 
189
201
  if self.task_type == ModelTaskType.BINARY:
190
202
  if not is_integer_dtype(target):
@@ -201,7 +213,7 @@ class Dataset:
201
213
  elif self.task_type == ModelTaskType.MULTICLASS:
202
214
  if not is_integer_dtype(target):
203
215
  try:
204
- target = self.data[target_column].astype("category").cat.codes
216
+ target = target.astype("category").cat.codes
205
217
  except Exception:
206
218
  self.logger.exception("Failed to cast target to category codes for multiclass task type")
207
219
  raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
@@ -335,15 +347,37 @@ class Dataset:
335
347
  all_valid_message = self.bundle.get("validation_all_valid_message")
336
348
  invalid_message = self.bundle.get("validation_invalid_message")
337
349
 
350
+ oot_indices = []
351
+ if EVAL_SET_INDEX in self.data.columns:
352
+ for eval_set_index in self.data[EVAL_SET_INDEX].unique():
353
+ eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
354
+ if eval_set[target].isna().all():
355
+ oot_indices.append(eval_set_index)
356
+
338
357
  for col in columns_to_validate:
339
- self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
340
358
  if validate_target and target is not None and col == target:
341
- self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
359
+ if oot_indices:
360
+ mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
361
+ invalid_target_mask = (
362
+ self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
363
+ )
364
+ # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
365
+ self.data[f"{col}_is_valid"] = True
366
+ self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
367
+ else:
368
+ # No OOT: mark invalid where target is NaN or +/-inf
369
+ self.data[f"{col}_is_valid"] = ~(
370
+ self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
371
+ )
372
+ else:
373
+ self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
342
374
 
343
375
  if col in mandatory_columns:
344
376
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
345
377
 
346
- invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
378
+ # Use stable pandas API across versions: Series.unique keeps order
379
+ # and collapses multiple NaNs into a single NaN
380
+ invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
347
381
  valid_share = self.data[f"{col}_is_valid"].sum() / nrows
348
382
  original_col_name = self.columns_renaming[col]
349
383
  validation_stats[original_col_name] = {}