upgini 1.2.121a1__py3-none-any.whl → 1.2.121a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.121a1"
1
+ __version__ = "1.2.121a3"
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
155
155
  # features validation
156
156
  empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
157
157
  high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
158
- # one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
158
+ one_hot_encoded_features=One hot encoded features detected: {}
159
159
 
160
160
  # Dataset validation
161
161
  dataset_too_few_rows=X size should be at least {} rows after validation
@@ -36,9 +36,6 @@ class FeaturesValidator:
36
36
  value_counts = column.value_counts(dropna=False, normalize=True)
37
37
  most_frequent_percent = value_counts.iloc[0]
38
38
 
39
- if most_frequent_percent >= 0.99:
40
- empty_or_constant_features.append(f)
41
-
42
39
  if len(value_counts) == 1:
43
40
  empty_or_constant_features.append(f)
44
41
  elif most_frequent_percent >= 0.99:
@@ -103,18 +100,25 @@ class FeaturesValidator:
103
100
  @staticmethod
104
101
  def is_one_hot_encoded(series: pd.Series) -> bool:
105
102
  try:
106
- # Column contains only 0 and 1 (as strings or numbers)
107
- series = series.astype(float)
108
- if set(series.unique()) != {0.0, 1.0}:
103
+ # First, handle string representations of True/False
104
+ series_copy = series.copy()
105
+ if series_copy.dtype == "object" or series_copy.dtype == "string":
106
+ # Convert string representations of boolean values to numeric
107
+ series_copy = series_copy.astype(str).str.strip().str.lower()
108
+ series_copy = series_copy.replace({"true": "1", "false": "0"})
109
+
110
+ # Column contains only 0 and 1 (as strings or numbers or booleans)
111
+ series_copy = series_copy.astype(float)
112
+ if set(series_copy.unique()) != {0.0, 1.0}:
109
113
  return False
110
114
 
111
- series = series.astype(int)
115
+ series_copy = series_copy.astype(int)
112
116
 
113
117
  # Column doesn't contain any NaN, np.NaN, space, null, etc.
114
- if not (series.isin([0, 1])).all():
118
+ if not (series_copy.isin([0, 1])).all():
115
119
  return False
116
120
 
117
- vc = series.value_counts()
121
+ vc = series_copy.value_counts()
118
122
  # Column should contain both 0 and 1
119
123
  if len(vc) != 2:
120
124
  return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.121a1
3
+ Version: 1.2.121a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=lbqEfhDGdLuugmia7aJpwXt4xpDEZT5h_07_bMMutgk,26
1
+ upgini/__about__.py,sha256=Cxw4BNratannn0d_z-lnfpv2ewsqbgmtVkJj8WoA6x0,26
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=cNeVkWZMyjGCYGqmOOeJqisqPSEBtmfIw_U1rmgQw4w,29285
41
+ upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
59
59
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
60
60
  upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
61
- upgini/utils/features_validator.py,sha256=wkPQlQFK6EQdnOd1MxFCSmb8gEqzCYJX1isLPaeRsgU,4365
61
+ upgini/utils/features_validator.py,sha256=5oNO8UV6SJ03fSkq15CQZARQZo-cii-hT9YbAbxE20o,4732
62
62
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
63
63
  upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
64
64
  upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
74
74
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.121a1.dist-info/METADATA,sha256=8lCLPlcxApmxxhl8DgplSrHe_Z_GHqIiOLB66OCabPo,50745
78
- upgini-1.2.121a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.121a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.121a1.dist-info/RECORD,,
77
+ upgini-1.2.121a3.dist-info/METADATA,sha256=Ej1y42arckLkQQc0FhSuzKTWK6FS8n7I_aSCSm5CPKQ,50745
78
+ upgini-1.2.121a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
+ upgini-1.2.121a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.121a3.dist-info/RECORD,,