upgini 1.2.121a1__py3-none-any.whl → 1.2.121a3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/features_validator.py +13 -9
- {upgini-1.2.121a1.dist-info → upgini-1.2.121a3.dist-info}/METADATA +1 -1
- {upgini-1.2.121a1.dist-info → upgini-1.2.121a3.dist-info}/RECORD +7 -7
- {upgini-1.2.121a1.dist-info → upgini-1.2.121a3.dist-info}/WHEEL +0 -0
- {upgini-1.2.121a1.dist-info → upgini-1.2.121a3.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.121a1"
|
1
|
+
__version__ = "1.2.121a3"
|
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
|
|
155
155
|
# features validation
|
156
156
|
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
157
157
|
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
158
|
-
|
158
|
+
one_hot_encoded_features=One hot encoded features detected: {}
|
159
159
|
|
160
160
|
# Dataset validation
|
161
161
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
@@ -36,9 +36,6 @@ class FeaturesValidator:
|
|
36
36
|
value_counts = column.value_counts(dropna=False, normalize=True)
|
37
37
|
most_frequent_percent = value_counts.iloc[0]
|
38
38
|
|
39
|
-
if most_frequent_percent >= 0.99:
|
40
|
-
empty_or_constant_features.append(f)
|
41
|
-
|
42
39
|
if len(value_counts) == 1:
|
43
40
|
empty_or_constant_features.append(f)
|
44
41
|
elif most_frequent_percent >= 0.99:
|
@@ -103,18 +100,25 @@ class FeaturesValidator:
|
|
103
100
|
@staticmethod
|
104
101
|
def is_one_hot_encoded(series: pd.Series) -> bool:
|
105
102
|
try:
|
106
|
-
#
|
107
|
-
|
108
|
-
if
|
103
|
+
# First, handle string representations of True/False
|
104
|
+
series_copy = series.copy()
|
105
|
+
if series_copy.dtype == "object" or series_copy.dtype == "string":
|
106
|
+
# Convert string representations of boolean values to numeric
|
107
|
+
series_copy = series_copy.astype(str).str.strip().str.lower()
|
108
|
+
series_copy = series_copy.replace({"true": "1", "false": "0"})
|
109
|
+
|
110
|
+
# Column contains only 0 and 1 (as strings or numbers or booleans)
|
111
|
+
series_copy = series_copy.astype(float)
|
112
|
+
if set(series_copy.unique()) != {0.0, 1.0}:
|
109
113
|
return False
|
110
114
|
|
111
|
-
|
115
|
+
series_copy = series_copy.astype(int)
|
112
116
|
|
113
117
|
# Column doesn't contain any NaN, np.NaN, space, null, etc.
|
114
|
-
if not (
|
118
|
+
if not (series_copy.isin([0, 1])).all():
|
115
119
|
return False
|
116
120
|
|
117
|
-
vc =
|
121
|
+
vc = series_copy.value_counts()
|
118
122
|
# Column should contain both 0 and 1
|
119
123
|
if len(vc) != 2:
|
120
124
|
return False
|
@@ -1,4 +1,4 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=Cxw4BNratannn0d_z-lnfpv2ewsqbgmtVkJj8WoA6x0,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
|
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
59
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
60
60
|
upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
|
61
|
-
upgini/utils/features_validator.py,sha256=
|
61
|
+
upgini/utils/features_validator.py,sha256=5oNO8UV6SJ03fSkq15CQZARQZo-cii-hT9YbAbxE20o,4732
|
62
62
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
63
63
|
upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
79
|
-
upgini-1.2.
|
80
|
-
upgini-1.2.
|
77
|
+
upgini-1.2.121a3.dist-info/METADATA,sha256=Ej1y42arckLkQQc0FhSuzKTWK6FS8n7I_aSCSm5CPKQ,50745
|
78
|
+
upgini-1.2.121a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.121a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.121a3.dist-info/RECORD,,
|
File without changes
|
File without changes
|