upgini 1.2.14a2__py3-none-any.whl → 1.2.14a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.14a2"
1
+ __version__ = "1.2.14a3"
@@ -203,7 +203,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
203
203
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
204
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
205
205
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
206
- target_type_detected=\nDetected task type: {}\n
206
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
207
207
  # all_ok_community_invite=Chat with us in Slack community:
208
208
  all_ok_community_invite=❓ Support request
209
209
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
@@ -24,49 +24,87 @@ def define_task(
24
24
  ) -> ModelTaskType:
25
25
  if logger is None:
26
26
  logger = logging.getLogger()
27
+
28
+ # Replace inf and -inf with NaN to handle extreme values correctly
29
+ y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
30
+
31
+ # Drop NaN values from the target
27
32
  target = y.dropna()
33
+
34
+ # For numeric targets keep only finite values
28
35
  if is_numeric_dtype(target):
29
36
  target = target.loc[np.isfinite(target)]
30
37
  else:
38
+ # If not numeric, drop empty strings as well
31
39
  target = target.loc[target != ""]
40
+
41
+ # Raise error if there are no valid values left in the target
32
42
  if len(target) == 0:
33
43
  raise ValidationError(bundle.get("empty_target"))
44
+
45
+ # Count unique values in the target
34
46
  target_items = target.nunique()
47
+
48
+ # Raise error if all target values are the same
35
49
  if target_items == 1:
36
50
  raise ValidationError(bundle.get("dataset_constant_target"))
51
+
52
+ reason = "" # Will store the reason for selecting the task type
53
+
54
+ # Binary classification case: exactly two unique values
37
55
  if target_items == 2:
38
56
  task = ModelTaskType.BINARY
57
+ reason = "only two unique label-values observed"
39
58
  else:
59
+ # Attempt to convert target to numeric
40
60
  try:
41
61
  target = pd.to_numeric(target)
42
62
  is_numeric = True
43
63
  except Exception:
44
64
  is_numeric = False
45
65
 
46
- # If any value is non numeric - multiclass
66
+ # If target cannot be converted to numeric, assume multiclass classification
47
67
  if not is_numeric:
48
68
  task = ModelTaskType.MULTICLASS
69
+ reason = "non-numeric label values observed"
49
70
  else:
71
+ # Calculate the ratio of unique values to total number of values
72
+ unique_ratio = target.nunique() / float(len(target))
73
+
74
+ # Multiclass classification: few unique values and integer encoding
50
75
  if target.nunique() <= 50 and is_int_encoding(target.unique()):
51
76
  task = ModelTaskType.MULTICLASS
77
+ reason = "few unique label-values observed and can be considered as categorical"
78
+ # Regression case: if there are date features, assume regression
52
79
  elif has_date:
53
80
  task = ModelTaskType.REGRESSION
81
+ reason = "date features are present, treating as regression"
54
82
  else:
83
+ # Exclude zero values and compute the share of unique values among the non-zero targets (target_ratio)
55
84
  non_zero_target = target[target != 0]
56
85
  target_items = non_zero_target.nunique()
57
86
  target_ratio = target_items / len(non_zero_target)
87
+
88
+ # Treat as regression if any of the criteria below holds (unique_ratio, non-integer floats, unique non-zero count, target_ratio); otherwise multiclass
58
89
  if (
59
- (target.dtype.kind == "f" and np.any(target != target.astype(int))) # any non integer
90
+ unique_ratio > 0.1 # Use threshold to differentiate between regression and classification
91
+ or (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
60
92
  or target_items > 50
61
- or target_ratio > 0.2
93
+ or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
62
94
  ):
63
95
  task = ModelTaskType.REGRESSION
96
+ reason = "many unique label-values or non-integer floating point values observed"
64
97
  else:
65
98
  task = ModelTaskType.MULTICLASS
99
+ reason = "integer-like values with limited unique values observed"
100
+
101
+ # Log the selected task type together with the reason for it
102
+ logger.info(f"Detected task type: {task} (Reason: {reason})")
66
103
 
67
- logger.info(f"Detected task type: {task}")
104
+ # Print task type and reason if silent mode is off
68
105
  if not silent:
69
- print(bundle.get("target_type_detected").format(task))
106
+ print(bundle.get("target_type_detected").format(task, reason))
107
+
70
108
  return task
71
109
 
72
110
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.14a2
3
+ Version: 1.2.14a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=BQP0owrMOVx8xdGySP-ZkH5zEwf-hNcDQtPR3Zq2PP4,25
1
+ upgini/__about__.py,sha256=FtGXNn_dLawjJC8X1Icai3ZGyctjvAhmviJGa0zgVTM,25
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=OX-v3fKbptgm7XqpqbFruN7OXK0WgasfkatJwYOcgkE,26573
33
+ upgini/resource_bundle/strings.properties,sha256=uo9CIQMg8VFeHlL_mY2dwOumQnr0TenJNPNOfXPWlPI,26715
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
54
54
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=9jner9JLWCFhEKN2BqhQyOqagdkhA3mUwe6OCJQTaNU,8235
57
+ upgini/utils/target_utils.py,sha256=FsGRmZ2B9-Y1Mnkh2esz41nNgAUX2nleQRUmuDzjk_s,10133
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.14a2.dist-info/METADATA,sha256=omz8hWvDzi98MxyQ_ifPqh8o1RcTsMrelvgdLvpyJ6o,48579
61
- upgini-1.2.14a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.14a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.14a2.dist-info/RECORD,,
60
+ upgini-1.2.14a3.dist-info/METADATA,sha256=JIjXqPhg5R96xFOY1uDijJpO1LHxwkb-sh8Bf7Sc394,48579
61
+ upgini-1.2.14a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.14a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.14a3.dist-info/RECORD,,