PyPI - upgini - Versions diffs - 1.2.14a2__py3-none-any.whl → 1.2.14a4__py3-none-any.whl - Mend

upgini 1.2.14a2-py3-none-any.whl → 1.2.14a4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.14a2"
1	+ __version__ = "1.2.14a4"

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -203,7 +203,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=\nDetected task type: {}\n
+target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
 too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics

upgini/utils/target_utils.py CHANGED Viewed

@@ -24,49 +24,87 @@ def define_task(
 ) -> ModelTaskType:
     if logger is None:
         logger = logging.getLogger()
+    # Replace inf and -inf with NaN to handle extreme values correctly
+    y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
+    # Drop NaN values from the target
     target = y.dropna()
+    # Check if target is numeric and finite
     if is_numeric_dtype(target):
         target = target.loc[np.isfinite(target)]
     else:
+        # If not numeric, drop empty strings as well
         target = target.loc[target != ""]
+    # Raise error if there are no valid values left in the target
     if len(target) == 0:
         raise ValidationError(bundle.get("empty_target"))
+    # Count unique values in the target
     target_items = target.nunique()
+    # Raise error if all target values are the same
     if target_items == 1:
         raise ValidationError(bundle.get("dataset_constant_target"))
+    reason = ""  # Will store the reason for selecting the task type
+    # Binary classification case: exactly two unique values
     if target_items == 2:
         task = ModelTaskType.BINARY
+        reason = "only two unique label-values observed"
     else:
+        # Attempt to convert target to numeric
         try:
             target = pd.to_numeric(target)
             is_numeric = True
         except Exception:
             is_numeric = False
-        # If any value is non numeric - multiclass
+        # If target cannot be converted to numeric, assume multiclass classification
         if not is_numeric:
             task = ModelTaskType.MULTICLASS
+            reason = "non-numeric label values observed"
         else:
+            # Calculate the ratio of unique values to total number of values
+            unique_ratio = target.nunique() / float(len(target))
+            # Multiclass classification: few unique values and integer encoding
             if target.nunique() <= 50 and is_int_encoding(target.unique()):
                 task = ModelTaskType.MULTICLASS
+                reason = "few unique label-values observed and can be considered as categorical"
+            # Regression case: if there are date features, assume regression
             elif has_date:
                 task = ModelTaskType.REGRESSION
+                reason = "date features are present, treating as regression"
             else:
+                # Remove zero values and recalculate unique ratio
                 non_zero_target = target[target != 0]
                 target_items = non_zero_target.nunique()
                 target_ratio = target_items / len(non_zero_target)
+                # Use unique_ratio to determine whether to classify as regression or multiclass
                 if (
-                    (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
+                    unique_ratio > 0.1  # Use threshold to differentiate between regression and classification
+                    or (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # Non-integer float values
                     or target_items > 50
-                    or target_ratio > 0.2
+                    or target_ratio > 0.2  # If non-zero values have high ratio of uniqueness
                 ):
                     task = ModelTaskType.REGRESSION
+                    reason = "many unique label-values or non-integer floating point values observed"
                 else:
                     task = ModelTaskType.MULTICLASS
+                    reason = "integer-like values with limited unique values observed"
-    logger.info(f"Detected task type: {task}")
+    # Log or print the reason for the selected task type
+    logger.info(f"Detected task type: {task} (Reason: {reason})")
+    # Print task type and reason if silent mode is off
     if not silent:
-        print(bundle.get("target_type_detected").format(task))
+        print(bundle.get("target_type_detected").format(task, reason))
     return task
@@ -138,6 +176,7 @@ def balance_undersample(
                     ),
                 )
                 sample_strategy[class_value] = int(sample_size)
+            logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
             sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
             X = df[SYSTEM_RECORD_ID]
             X = X.to_frame(SYSTEM_RECORD_ID)
@@ -162,6 +201,10 @@ def balance_undersample(
             max_class_count,
             binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
         )
+        logger.info(
+            f"Min class count: {min_class_count}. Max class count: {max_class_count}."
+            f" Rebalance sample size: {sample_size}"
+        )
         sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
         resampled_data = df[
             (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))

{upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.14a2
+Version: 1.2.14a4
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-upgini/__about__.py,sha256=BQP0owrMOVx8xdGySP-ZkH5zEwf-hNcDQtPR3Zq2PP4,25
+upgini/__about__.py,sha256=tfhdEEoOzTUSKNF9hQy8PZO57ri0xEeduAwFCwtVLCg,25
 upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=OX-v3fKbptgm7XqpqbFruN7OXK0WgasfkatJwYOcgkE,26573
+upgini/resource_bundle/strings.properties,sha256=uo9CIQMg8VFeHlL_mY2dwOumQnr0TenJNPNOfXPWlPI,26715
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/target_utils.py,sha256=9jner9JLWCFhEKN2BqhQyOqagdkhA3mUwe6OCJQTaNU,8235
+upgini/utils/target_utils.py,sha256=3itoOxwEycnIdWeTL3KjuS_NdJleL6nMRqblQLmy6Kk,10413
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.2.14a2.dist-info/METADATA,sha256=omz8hWvDzi98MxyQ_ifPqh8o1RcTsMrelvgdLvpyJ6o,48579
-upgini-1.2.14a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.14a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.14a2.dist-info/RECORD,,
+upgini-1.2.14a4.dist-info/METADATA,sha256=iwhYx9Mru7TEEylcJbeOaZlnKVBcpMrnYXzNEU2M4fg,48579
+upgini-1.2.14a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.14a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.14a4.dist-info/RECORD,,

{upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.14a2__py3-none-any.whl → 1.2.14a4__py3-none-any.whl

upgini 1.2.14a2-py3-none-any.whl → 1.2.14a4-py3-none-any.whl