upgini 1.1.255a3233.post2__py3-none-any.whl → 1.1.255a3233.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/data_source/data_source_publisher.py +6 -0
- upgini/dataset.py +10 -9
- upgini/features_enricher.py +33 -23
- upgini/metrics.py +27 -15
- upgini/resource_bundle/strings.properties +8 -9
- upgini/utils/datetime_utils.py +14 -1
- upgini/utils/features_validator.py +25 -9
- {upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/METADATA +1 -1
- {upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/RECORD +12 -12
- {upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/WHEEL +1 -1
- {upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/LICENSE +0 -0
- {upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/top_level.txt +0 -0
upgini/data_source/data_source_publisher.py
CHANGED
|
@@ -79,6 +79,12 @@ class DataSourcePublisher:
|
|
|
79
79
|
f"Invalid update frequency: {update_frequency}. "
|
|
80
80
|
f"Available values: {self.ACCEPTABLE_UPDATE_FREQUENCIES}"
|
|
81
81
|
)
|
|
82
|
+
if (
|
|
83
|
+
set(search_keys.values()) == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
|
|
84
|
+
or set(search_keys.values()) == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
|
|
85
|
+
or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
|
|
86
|
+
) and sort_column is None:
|
|
87
|
+
raise ValidationError("Sort column is required for passed search keys")
|
|
82
88
|
|
|
83
89
|
request = {
|
|
84
90
|
"dataTableUri": data_table_uri,
|
upgini/dataset.py
CHANGED
|
@@ -225,11 +225,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
225
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
226
226
|
|
|
227
227
|
def __convert_bools(self):
|
|
228
|
-
"""Convert bool columns
|
|
228
|
+
"""Convert bool columns to string"""
|
|
229
229
|
# self.logger.info("Converting bool to int")
|
|
230
230
|
for col in self.data.columns:
|
|
231
231
|
if is_bool(self.data[col]):
|
|
232
|
-
self.data[col] = self.data[col].astype("
|
|
232
|
+
self.data[col] = self.data[col].astype("str")
|
|
233
233
|
|
|
234
234
|
def __convert_float16(self):
|
|
235
235
|
"""Convert float16 to float"""
|
|
@@ -309,13 +309,12 @@ class Dataset: # (pd.DataFrame):
|
|
|
309
309
|
if self.data[ip].isnull().all():
|
|
310
310
|
raise ValidationError(self.bundle.get("invalid_ip").format(ip))
|
|
311
311
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
self.columns_renaming[ipv4] = original_ip
|
|
312
|
+
ipv4 = ip + "_v4"
|
|
313
|
+
self.data[ipv4] = self.data[ip].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
|
|
314
|
+
self.meaning_types[ipv4] = FileColumnMeaningType.IP_ADDRESS
|
|
315
|
+
self.etalon_def[FileColumnMeaningType.IP_ADDRESS.value] = ipv4
|
|
316
|
+
search_keys.add(ipv4)
|
|
317
|
+
self.columns_renaming[ipv4] = original_ip
|
|
319
318
|
|
|
320
319
|
ipv6 = ip + "_v6"
|
|
321
320
|
self.data[ipv6] = (
|
|
@@ -687,8 +686,10 @@ class Dataset: # (pd.DataFrame):
|
|
|
687
686
|
+ "".join("<tr>" + "".join(map(map_color, row[1:])) + "</tr>" for row in df_stats.itertuples())
|
|
688
687
|
+ "</table>"
|
|
689
688
|
)
|
|
689
|
+
print()
|
|
690
690
|
display(HTML(html_stats))
|
|
691
691
|
except (ImportError, NameError):
|
|
692
|
+
print()
|
|
692
693
|
print(df_stats)
|
|
693
694
|
|
|
694
695
|
if len(self.data) == 0:
|
upgini/features_enricher.py
CHANGED
|
@@ -27,7 +27,6 @@ from scipy.stats import ks_2samp
|
|
|
27
27
|
from sklearn.base import TransformerMixin
|
|
28
28
|
from sklearn.exceptions import NotFittedError
|
|
29
29
|
from sklearn.model_selection import BaseCrossValidator
|
|
30
|
-
from sklearn.model_selection._split import GroupsConsumerMixin
|
|
31
30
|
|
|
32
31
|
from upgini.autofe.feature import Feature
|
|
33
32
|
from upgini.data_source.data_source_publisher import CommercialSchema
|
|
@@ -1255,8 +1254,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1255
1254
|
_cv, groups = CVConfig(
|
|
1256
1255
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
1257
1256
|
).get_cv_and_groups(X)
|
|
1258
|
-
|
|
1259
|
-
|
|
1257
|
+
else:
|
|
1258
|
+
from sklearn import __version__ as sklearn_version
|
|
1259
|
+
try:
|
|
1260
|
+
from sklearn.model_selection._split import GroupsConsumerMixin
|
|
1261
|
+
|
|
1262
|
+
if isinstance(_cv, GroupsConsumerMixin):
|
|
1263
|
+
groups = get_groups(X, group_columns)
|
|
1264
|
+
except ImportError:
|
|
1265
|
+
print(f"WARNING: Unsupported scikit-learn version {sklearn_version}. Restart kernel and try again")
|
|
1266
|
+
self.logger.exception(
|
|
1267
|
+
f"Failed to import GroupsConsumerMixin to check CV. Version of sklearn: {sklearn_version}"
|
|
1268
|
+
)
|
|
1260
1269
|
|
|
1261
1270
|
return _cv, groups
|
|
1262
1271
|
|
|
@@ -1329,18 +1338,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1329
1338
|
fitting_X = X_sorted[client_features].copy()
|
|
1330
1339
|
fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
|
|
1331
1340
|
|
|
1332
|
-
#
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
# fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
|
|
1341
|
+
# Detect and drop high cardinality columns in train
|
|
1342
|
+
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1343
|
+
columns_with_high_cardinality = [
|
|
1344
|
+
c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
|
|
1345
|
+
]
|
|
1346
|
+
if len(columns_with_high_cardinality) > 0:
|
|
1347
|
+
self.logger.warning(
|
|
1348
|
+
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
1349
|
+
)
|
|
1350
|
+
fitting_X = fitting_X.drop(columns=columns_with_high_cardinality, errors="ignore")
|
|
1351
|
+
fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
|
|
1344
1352
|
|
|
1345
1353
|
# Detect and drop constant columns
|
|
1346
1354
|
constant_columns = FeaturesValidator.find_constant_features(fitting_X)
|
|
@@ -1389,11 +1397,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1389
1397
|
].copy()
|
|
1390
1398
|
|
|
1391
1399
|
# # Drop high cardinality features in eval set
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1400
|
+
if len(columns_with_high_cardinality) > 0:
|
|
1401
|
+
fitting_eval_X = fitting_eval_X.drop(columns=columns_with_high_cardinality, errors="ignore")
|
|
1402
|
+
fitting_enriched_eval_X = fitting_enriched_eval_X.drop(
|
|
1403
|
+
columns=columns_with_high_cardinality, errors="ignore"
|
|
1404
|
+
)
|
|
1397
1405
|
# Drop constant features in eval_set
|
|
1398
1406
|
if len(constant_columns) > 0:
|
|
1399
1407
|
fitting_eval_X = fitting_eval_X.drop(columns=constant_columns, errors="ignore")
|
|
@@ -1673,7 +1681,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1673
1681
|
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
1674
1682
|
else:
|
|
1675
1683
|
self.logger.info("Transform without eval_set")
|
|
1676
|
-
df =
|
|
1684
|
+
df = validated_X.copy()
|
|
1677
1685
|
|
|
1678
1686
|
df[TARGET] = validated_y
|
|
1679
1687
|
num_samples = _num_samples(df)
|
|
@@ -1850,7 +1858,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1850
1858
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1851
1859
|
transform_usage.limit, transform_usage.transformed_rows
|
|
1852
1860
|
)
|
|
1853
|
-
self.logger.info(
|
|
1861
|
+
self.logger.info(msg)
|
|
1854
1862
|
print(msg)
|
|
1855
1863
|
|
|
1856
1864
|
validated_X = self._validate_X(X, is_transform=True)
|
|
@@ -2276,7 +2284,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2276
2284
|
|
|
2277
2285
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2278
2286
|
|
|
2279
|
-
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2287
|
+
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2288
|
+
df, features_columns, self.generate_features, self.warning_counter
|
|
2289
|
+
)
|
|
2280
2290
|
self.fit_dropped_features.update(features_to_drop)
|
|
2281
2291
|
df = df.drop(columns=features_to_drop)
|
|
2282
2292
|
|
upgini/metrics.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
9
|
+
import catboost
|
|
9
10
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
10
11
|
from numpy import log1p
|
|
11
12
|
from pandas.api.types import is_numeric_dtype
|
|
@@ -424,24 +425,35 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
424
425
|
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
425
426
|
|
|
426
427
|
# Find embeddings
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
self.
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
428
|
+
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
429
|
+
emb_pattern = r"(.+)_emb\d+"
|
|
430
|
+
self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
|
|
431
|
+
embedding_features = []
|
|
432
|
+
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
433
|
+
self.logger.info(
|
|
434
|
+
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
435
|
+
f"{self.emb_features}"
|
|
436
|
+
)
|
|
437
|
+
X, embedding_features = self.group_embeddings(X)
|
|
438
|
+
params["embedding_features"] = embedding_features
|
|
439
|
+
else:
|
|
440
|
+
self.logger.info(
|
|
441
|
+
f"Embedding features count less than 3, so use them separately: {self.emb_features}"
|
|
442
|
+
)
|
|
443
|
+
self.emb_features = []
|
|
436
444
|
else:
|
|
437
|
-
self.
|
|
445
|
+
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
438
446
|
|
|
439
447
|
# Find text features from passed in generate_features
|
|
440
|
-
if
|
|
441
|
-
self.
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
448
|
+
if hasattr(CatBoostClassifier, "get_text_feature_indices"):
|
|
449
|
+
if self.text_features is not None:
|
|
450
|
+
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
451
|
+
self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
|
|
452
|
+
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
453
|
+
params["text_features"] = self.text_features
|
|
454
|
+
else:
|
|
455
|
+
self.text_features = None
|
|
456
|
+
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
445
457
|
|
|
446
458
|
# Find rest categorical features
|
|
447
459
|
self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -28,8 +28,8 @@ metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has
|
|
|
28
28
|
metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
|
|
29
29
|
metrics_no_important_features=\nWARNING: No important features to calculate metrics
|
|
30
30
|
metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
31
|
-
metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
32
|
-
transform_with_trial_features=\nWARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
|
|
31
|
+
# metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
32
|
+
# transform_with_trial_features=\nWARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
|
|
33
33
|
# Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
|
|
34
34
|
metrics_with_paid_features=\nWARNING: The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
|
|
35
35
|
transform_with_paid_features=\nWARNING: Enriching with Paid data is not available.\nContact Upgini support for the data access
|
|
@@ -132,18 +132,17 @@ baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input
|
|
|
132
132
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
133
133
|
# target validation
|
|
134
134
|
empty_target=Target is empty in all rows
|
|
135
|
-
non_numeric_target=Binary target should be numerical type
|
|
135
|
+
# non_numeric_target=Binary target should be numerical type
|
|
136
136
|
uneven_eval_target_distribution=\nWARNING: y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
|
|
137
|
-
target_outliers_warning
|
|
137
|
+
target_outliers_warning=\nWARNING: We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
|
|
138
138
|
# features validation
|
|
139
|
-
empty_or_contant_features
|
|
140
|
-
high_cardinality_features
|
|
141
|
-
one_hot_encoded_features=\nWARNING: One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
|
139
|
+
empty_or_contant_features=\nWARNING: Columns {} has value with frequency more than 99%, removed from X
|
|
140
|
+
high_cardinality_features=\nWARNING: Columns {} has high cardinality (>90% unique values), removed from X
|
|
141
|
+
# one_hot_encoded_features=\nWARNING: One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
|
142
142
|
# Dataset validation
|
|
143
143
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
|
144
144
|
dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
|
|
145
145
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
146
|
-
dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
|
|
147
146
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
148
147
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
149
148
|
dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
@@ -165,7 +164,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
|
|
|
165
164
|
dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
|
|
166
165
|
dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
|
|
167
166
|
dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
|
|
168
|
-
binary_small_dataset
|
|
167
|
+
binary_small_dataset=\nWARNING: The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.
|
|
169
168
|
all_search_keys_invalid=All search keys are invalid
|
|
170
169
|
all_emails_invalid=\nWARNING: All values in column {} are invalid emails
|
|
171
170
|
# Metrics validation
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -11,7 +11,20 @@ from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
12
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
13
13
|
|
|
14
|
-
DATE_FORMATS = [
|
|
14
|
+
DATE_FORMATS = [
|
|
15
|
+
"%Y-%m-%d",
|
|
16
|
+
"%d.%m.%y",
|
|
17
|
+
"%d.%m.%Y",
|
|
18
|
+
"%m.%d.%y",
|
|
19
|
+
"%m.%d.%Y",
|
|
20
|
+
"%Y/%m/%d",
|
|
21
|
+
"%y/%m/%d",
|
|
22
|
+
"%d/%m/%Y",
|
|
23
|
+
"%d/%m/%y",
|
|
24
|
+
"%m/%d/%Y",
|
|
25
|
+
"%m/%d/%y",
|
|
26
|
+
"%Y-%m-%dT%H:%M:%S.%f",
|
|
27
|
+
]
|
|
15
28
|
|
|
16
29
|
DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
|
|
17
30
|
|
upgini/utils/features_validator.py
CHANGED
|
@@ -3,7 +3,8 @@ from logging import Logger
|
|
|
3
3
|
from typing import List, Optional
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from pandas.api.types import
|
|
6
|
+
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
|
+
|
|
7
8
|
from upgini.resource_bundle import bundle
|
|
8
9
|
from upgini.utils.warning_counter import WarningCounter
|
|
9
10
|
|
|
@@ -16,9 +17,16 @@ class FeaturesValidator:
|
|
|
16
17
|
self.logger = logging.getLogger()
|
|
17
18
|
self.logger.setLevel("FATAL")
|
|
18
19
|
|
|
19
|
-
def validate(
|
|
20
|
+
def validate(
|
|
21
|
+
self,
|
|
22
|
+
df: pd.DataFrame,
|
|
23
|
+
features: List[str],
|
|
24
|
+
features_for_generate: Optional[List[str]],
|
|
25
|
+
warning_counter: WarningCounter,
|
|
26
|
+
) -> List[str]:
|
|
20
27
|
# one_hot_encoded_features = []
|
|
21
28
|
empty_or_constant_features = []
|
|
29
|
+
high_cardinality_features = []
|
|
22
30
|
|
|
23
31
|
for f in features:
|
|
24
32
|
column = df[f]
|
|
@@ -51,23 +59,31 @@ class FeaturesValidator:
|
|
|
51
59
|
msg = bundle.get("empty_or_contant_features").format(empty_or_constant_features)
|
|
52
60
|
print(msg)
|
|
53
61
|
self.logger.warning(msg)
|
|
62
|
+
warning_counter.increment()
|
|
63
|
+
|
|
64
|
+
high_cardinality_features = self.find_high_cardinality(df[features])
|
|
65
|
+
if features_for_generate:
|
|
66
|
+
high_cardinality_features = [f for f in high_cardinality_features if f not in features_for_generate]
|
|
67
|
+
if high_cardinality_features:
|
|
68
|
+
msg = bundle.get("high_cardinality_features").format(high_cardinality_features)
|
|
69
|
+
print(msg)
|
|
70
|
+
self.logger.warning(msg)
|
|
71
|
+
warning_counter.increment()
|
|
54
72
|
|
|
55
|
-
return empty_or_constant_features
|
|
73
|
+
return empty_or_constant_features + high_cardinality_features
|
|
56
74
|
|
|
57
75
|
@staticmethod
|
|
58
76
|
def find_high_cardinality(df: pd.DataFrame) -> List[str]:
|
|
59
77
|
# Remove high cardinality columns
|
|
60
78
|
row_count = df.shape[0]
|
|
79
|
+
if row_count < 100: # For tests with small datasets
|
|
80
|
+
return []
|
|
61
81
|
return [
|
|
62
82
|
i
|
|
63
83
|
for i in df
|
|
64
|
-
if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique() / row_count >= 0.
|
|
84
|
+
if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique(dropna=False) / row_count >= 0.95)
|
|
65
85
|
]
|
|
66
86
|
|
|
67
87
|
@staticmethod
|
|
68
88
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
69
|
-
return [
|
|
70
|
-
i
|
|
71
|
-
for i in df
|
|
72
|
-
if df[i].nunique() == 1
|
|
73
|
-
]
|
|
89
|
+
return [i for i in df if df[i].nunique() == 1]
|
{upgini-1.1.255a3233.post2.dist-info → upgini-1.1.255a3233.post4.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=4LfrUwxhd__ZVqZkjPVxbC4SW3YLsk1sMMqnYPUaVpw,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=WbwnLvPVqn4m995b6jSamWkXyRVy18fnG7faBeuJbWI,172132
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
8
8
|
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
|
|
10
10
|
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -21,14 +21,14 @@ upgini/autofe/operand.py,sha256=Rhy7Ky3we-I1Su1--dS4xdsO3K8neV4rqM_Q4xYE4ug,2779
|
|
|
21
21
|
upgini/autofe/unary.py,sha256=gyMkrx9bfa3o19zS-4JaRlScHrfeZGBsYe7d_6ePT-0,2853
|
|
22
22
|
upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
|
|
23
23
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
24
|
+
upgini/data_source/data_source_publisher.py,sha256=QASEDhJ9SxJKcWxoN2vUPxrM_HTlwKQOPa92L7EQneA,15962
|
|
25
25
|
upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
|
|
26
26
|
upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
|
|
27
27
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
28
|
upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
|
|
29
29
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
30
30
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
31
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
31
|
+
upgini/resource_bundle/strings.properties,sha256=MGU_oBc15VAmbPZdThCpm3B4xERAKwbCIUTIG66dvUo,25228
|
|
32
32
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
33
33
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
34
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
@@ -40,12 +40,12 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
|
|
|
40
40
|
upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
|
|
41
41
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
42
42
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
43
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
43
|
+
upgini/utils/datetime_utils.py,sha256=ol5Bgh98wU6KBY9z4QskNO0ja-L7HJL70HmTAjl7iRU,8836
|
|
44
44
|
upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
|
|
45
45
|
upgini/utils/display_utils.py,sha256=BfPaJGUJAkGaijdAKPrdIfUqjXewFbBRrYqzzylB9t4,10667
|
|
46
46
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
47
47
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
48
|
-
upgini/utils/features_validator.py,sha256=
|
|
48
|
+
upgini/utils/features_validator.py,sha256=P-dfjBLAMxgzOcUX1Jo1bhVp8-8WyTyF3Ef0YZ5nfRI,3269
|
|
49
49
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
50
50
|
upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
51
51
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
@@ -55,8 +55,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
|
|
|
55
55
|
upgini/utils/target_utils.py,sha256=WVhhxpQVvnhsDV7ctlds51VFg7hz59S_MFUSoRZFszw,7204
|
|
56
56
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
57
57
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
58
|
-
upgini-1.1.255a3233.
|
|
59
|
-
upgini-1.1.255a3233.
|
|
60
|
-
upgini-1.1.255a3233.
|
|
61
|
-
upgini-1.1.255a3233.
|
|
62
|
-
upgini-1.1.255a3233.
|
|
58
|
+
upgini-1.1.255a3233.post4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
59
|
+
upgini-1.1.255a3233.post4.dist-info/METADATA,sha256=LISA1JiOQR8ZPKCt7QlF-sTEJyiban04m9Zfln5DVyA,48167
|
|
60
|
+
upgini-1.1.255a3233.post4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
61
|
+
upgini-1.1.255a3233.post4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
62
|
+
upgini-1.1.255a3233.post4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|