upgini 1.2.120__tar.gz → 1.2.121__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {upgini-1.2.120 → upgini-1.2.121}/PKG-INFO +1 -1
  2. upgini-1.2.121/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/data_source/data_source_publisher.py +6 -3
  4. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/resource_bundle/strings.properties +1 -1
  5. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/features_validator.py +49 -17
  6. upgini-1.2.120/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.120 → upgini-1.2.121}/.gitignore +0 -0
  8. {upgini-1.2.120 → upgini-1.2.121}/LICENSE +0 -0
  9. {upgini-1.2.120 → upgini-1.2.121}/README.md +0 -0
  10. {upgini-1.2.120 → upgini-1.2.121}/pyproject.toml +0 -0
  11. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/all_operators.py +0 -0
  17. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/operator.py +0 -0
  22. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/__init__.py +0 -0
  23. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/base.py +0 -0
  24. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/cross.py +0 -0
  25. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/delta.py +0 -0
  26. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/utils.py +0 -0
  32. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/autofe/vector.py +0 -0
  33. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/data_source/__init__.py +0 -0
  34. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/features_enricher.py +0 -0
  37. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/http.py +0 -0
  38. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.120 → upgini-1.2.121}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.120
3
+ Version: 1.2.121
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.121"
@@ -519,21 +519,24 @@ class DataSourcePublisher:
519
519
  description: str = "",
520
520
  ):
521
521
  if model_type is not None and model_type not in ["ONNX", "CATBOOST"]:
522
- raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
522
+ raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX, CATBOOST")
523
523
  metadata = {
524
524
  "modelName": name,
525
525
  "inputNames": input_names,
526
526
  "dateColumn": date_column,
527
527
  "scoreName": score_name,
528
528
  "searchTaskId": search_id,
529
- "modelType": model_type or "ONNX",
529
+ "modelType": model_type or "CATBOOST",
530
530
  "description": description,
531
531
  }
532
532
 
533
533
  trace_id = str(uuid.uuid4())
534
534
  with MDC(trace_id=trace_id):
535
535
  try:
536
- self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
536
+ result = self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
537
+ if "ERROR" in result:
538
+ raise Exception(result)
539
+ print(result)
537
540
  except Exception:
538
541
  self.logger.exception("Failed to upload autofe model")
539
542
  raise
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
155
155
  # features validation
156
156
  empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
157
157
  high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
158
- # one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
158
+ one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
159
159
 
160
160
  # Dataset validation
161
161
  dataset_too_few_rows=X size should be at least {} rows after validation
@@ -24,7 +24,7 @@ class FeaturesValidator:
24
24
  features_for_generate: Optional[List[str]] = None,
25
25
  columns_renaming: Optional[Dict[str, str]] = None,
26
26
  ) -> Tuple[List[str], List[str]]:
27
- # one_hot_encoded_features = []
27
+ one_hot_encoded_features = []
28
28
  empty_or_constant_features = []
29
29
  high_cardinality_features = []
30
30
  warnings = []
@@ -36,23 +36,17 @@ class FeaturesValidator:
36
36
  value_counts = column.value_counts(dropna=False, normalize=True)
37
37
  most_frequent_percent = value_counts.iloc[0]
38
38
 
39
- if most_frequent_percent >= 0.99:
39
+ if len(value_counts) == 1:
40
40
  empty_or_constant_features.append(f)
41
-
42
- # TODO implement one-hot encoding check
43
- # if len(value_counts) == 1:
44
- # empty_or_constant_features.append(f)
45
- # elif most_frequent_percent >= 0.99:
46
- # empty_or_constant_features.append(f)
47
- # if set(value_counts.index.to_list()) == {0, 1}:
48
- # one_hot_encoded_features.append(f)
49
- # else:
50
- # empty_or_constant_features.append(f)
51
- # continue
52
-
53
- # if one_hot_encoded_features:
54
- # msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
55
- # warnings.append(msg)
41
+ elif most_frequent_percent >= 0.99:
42
+ if self.is_one_hot_encoded(column):
43
+ one_hot_encoded_features.append(f)
44
+ else:
45
+ empty_or_constant_features.append(f)
46
+
47
+ if one_hot_encoded_features:
48
+ msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
49
+ warnings.append(msg)
56
50
 
57
51
  columns_renaming = columns_renaming or {}
58
52
 
@@ -102,3 +96,41 @@ class FeaturesValidator:
102
96
  @staticmethod
103
97
  def find_constant_features(df: pd.DataFrame) -> List[str]:
104
98
  return [i for i in df if df[i].nunique() <= 1]
99
+
100
+ @staticmethod
101
+ def is_one_hot_encoded(series: pd.Series) -> bool:
102
+ try:
103
+ # All rows should be the same type
104
+ if series.apply(lambda x: type(x)).nunique() != 1:
105
+ return False
106
+
107
+ # First, handle string representations of True/False
108
+ series_copy = series.copy()
109
+ if series_copy.dtype == "object" or series_copy.dtype == "string":
110
+ # Convert string representations of boolean values to numeric
111
+ series_copy = series_copy.astype(str).str.strip().str.lower()
112
+ series_copy = series_copy.replace({"true": "1", "false": "0"})
113
+
114
+ # Column contains only 0 and 1 (as strings or numbers or booleans)
115
+ series_copy = series_copy.astype(float)
116
+ if set(series_copy.unique()) != {0.0, 1.0}:
117
+ return False
118
+
119
+ series_copy = series_copy.astype(int)
120
+
121
+ # Column doesn't contain any NaN, np.NaN, space, null, etc.
122
+ if not (series_copy.isin([0, 1])).all():
123
+ return False
124
+
125
+ vc = series_copy.value_counts()
126
+ # Column should contain both 0 and 1
127
+ if len(vc) != 2:
128
+ return False
129
+
130
+ # Minority class is 1
131
+ if vc[1] >= vc[0]:
132
+ return False
133
+
134
+ return True
135
+ except ValueError:
136
+ return False
@@ -1 +0,0 @@
1
- __version__ = "1.2.120"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes