PyPI - upgini - Versions diffs - 1.2.113a3974.dev2__tar.gz → 1.2.114a1__tar.gz - Mend

upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80)

{upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.113a3974.dev2
+Version: 1.2.114a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
-    		"hashed_email": SearchKey.HEM,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
+		"hashed_email": SearchKey.HEM,
 		"last_visit_ip_address": SearchKey.IP,
 		"registered_with_phone": SearchKey.PHONE
 	})
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
-    		"hashed_email": SearchKey.HEM,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
+		"hashed_email": SearchKey.HEM,
 		"last_visit_ip_address": SearchKey.IP,
 		"registered_with_phone": SearchKey.PHONE
 	},
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"zip_code": SearchKey.POSTAL_CODE,
+		"zip_code": SearchKey.POSTAL_CODE,
 	},
 	country_code = "US",
 	date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE
 	})
 # everything is ready to fit! For 200к records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
 	},
 )
 ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
 If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
 ```python
 enricher = FeaturesEnricher(
-    search_keys={
-        "sales_date": SearchKey.DATE,
+		search_keys={
+				"sales_date": SearchKey.DATE,
     },
     id_columns=["store_id", "product_id"],
     cv=CVType.time_series
@@ -733,9 +733,22 @@ enricher.fit(
 )
 ```
 #### ⚠️ Requirements for out-of-time dataset
-- Same data schema as for search initialization dataset
+- Same data schema as for search initialization X dataset
 - Pandas dataframe representation
+There are 3 options to pass out-of-time without labels:
+```python
+enricher.fit(
+  train_ids_and_features,
+  train_label,
+  eval_set = [
+    (eval_ids_and_features_1,),  # Just tuple of 1 element
+    (eval_ids_and_features_2, None),  # None as labels
+    (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)),  # List or Series of the same size as eval X
+  ]
+)
+```
 ### Use custom loss function in feature selection & metrics calculation
 `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -797,7 +810,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y)
 ```
-## Turn off removing of target outliers
+### Turn off removing of target outliers
 Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:
 ```python
@@ -808,7 +821,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y, remove_outliers_calc_metrics=False)
 ```
-## Turn off generating features on search keys
+### Turn off generating features on search keys
 Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
 ```python
@@ -816,6 +829,7 @@ enricher = FeaturesEnricher(
   search_keys={"date": SearchKey.DATE},
   generate_search_key_features=False,
 )
+```
 ## 🔑 Open up all capabilities of Upgini

{upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/README.md RENAMED Viewed

@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
-    		"hashed_email": SearchKey.HEM,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
+		"hashed_email": SearchKey.HEM,
 		"last_visit_ip_address": SearchKey.IP,
 		"registered_with_phone": SearchKey.PHONE
 	})
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
-    		"hashed_email": SearchKey.HEM,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
+		"hashed_email": SearchKey.HEM,
 		"last_visit_ip_address": SearchKey.IP,
 		"registered_with_phone": SearchKey.PHONE
 	},
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"zip_code": SearchKey.POSTAL_CODE,
+		"zip_code": SearchKey.POSTAL_CODE,
 	},
 	country_code = "US",
 	date_format = "%Y-%d-%m"
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE
 	})
 # everything is ready to fit! For 200к records fitting should take around 10 minutes,
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
 enricher = FeaturesEnricher(
 	search_keys={
 		"subscription_activation_date": SearchKey.DATE,
-    		"country": SearchKey.COUNTRY,
-    		"zip_code": SearchKey.POSTAL_CODE,
+		"country": SearchKey.COUNTRY,
+		"zip_code": SearchKey.POSTAL_CODE,
 	},
 )
 ```
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
 If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
 ```python
 enricher = FeaturesEnricher(
-    search_keys={
-        "sales_date": SearchKey.DATE,
+		search_keys={
+				"sales_date": SearchKey.DATE,
     },
     id_columns=["store_id", "product_id"],
     cv=CVType.time_series
@@ -687,9 +687,22 @@ enricher.fit(
 )
 ```
 #### ⚠️ Requirements for out-of-time dataset
-- Same data schema as for search initialization dataset
+- Same data schema as for search initialization X dataset
 - Pandas dataframe representation
+There are 3 options to pass out-of-time without labels:
+```python
+enricher.fit(
+  train_ids_and_features,
+  train_label,
+  eval_set = [
+    (eval_ids_and_features_1,),  # Just tuple of 1 element
+    (eval_ids_and_features_2, None),  # None as labels
+    (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)),  # List or Series of the same size as eval X
+  ]
+)
+```
 ### Use custom loss function in feature selection & metrics calculation
 `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -751,7 +764,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y)
 ```
-## Turn off removing of target outliers
+### Turn off removing of target outliers
 Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:
 ```python
@@ -762,7 +775,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y, remove_outliers_calc_metrics=False)
 ```
-## Turn off generating features on search keys
+### Turn off generating features on search keys
 Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
 ```python
@@ -770,6 +783,7 @@ enricher = FeaturesEnricher(
   search_keys={"date": SearchKey.DATE},
   generate_search_key_features=False,
 )
+```
 ## 🔑 Open up all capabilities of Upgini

upgini-1.2.114a1/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.114a1"

{upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/dataset.py RENAMED Viewed

@@ -343,7 +343,9 @@ class Dataset:
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
-            invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
+            # Use stable pandas API across versions: Series.unique keeps order
+            # and collapses multiple NaNs into a single NaN
+            invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
             valid_share = self.data[f"{col}_is_valid"].sum() / nrows
             original_col_name = self.columns_renaming[col]
             validation_stats[original_col_name] = {}

upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a1__tar.gz

upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a1__tar.gz