upgini 1.1.246a101__py3-none-any.whl → 1.1.248a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini might be problematic.
- upgini/autofe/operand.py +4 -2
- upgini/autofe/unary.py +2 -2
- upgini/dataset.py +4 -4
- upgini/features_enricher.py +59 -33
- upgini/metadata.py +1 -0
- upgini/resource_bundle/strings.properties +2 -0
- upgini/resource_bundle/strings_widget.properties +2 -0
- upgini/utils/cv_utils.py +37 -11
- upgini/utils/datetime_utils.py +12 -7
- upgini/utils/deduplicate_utils.py +9 -4
- {upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/METADATA +1 -1
- {upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/RECORD +15 -15
- {upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/top_level.txt +0 -0
upgini/autofe/operand.py
CHANGED
@@ -59,12 +59,14 @@ class PandasOperand(Operand, abc.ABC):
         df_from.loc[np.nan] = np.nan
         return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])
 
-    def _round_value(self, value: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+    def _round_value(
+        self, value: Union[pd.Series, pd.DataFrame], precision: Optional[int] = None
+    ) -> Union[pd.Series, pd.DataFrame]:
         if isinstance(value, pd.DataFrame):
             return value.apply(self._round_value, axis=1)
 
         if np.issubdtype(value.dtype, np.floating):
-            precision = np.finfo(value.dtype).precision
+            precision = precision or np.finfo(value.dtype).precision
             return np.trunc(value * 10**precision) / (10**precision)
         else:
             return value
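The new optional precision parameter lets callers pin the number of kept decimal places instead of always deriving it from the float dtype. A minimal standalone sketch of the truncation arithmetic (not the package's class, just the same math):

import numpy as np
import pandas as pd

def round_value(value: pd.Series, precision=None) -> pd.Series:
    if np.issubdtype(value.dtype, np.floating):
        # fall back to the dtype's decimal precision when no override is given
        precision = precision or np.finfo(value.dtype).precision
        return np.trunc(value * 10**precision) / (10**precision)
    return value

print(round_value(pd.Series([0.123456789012345]), 10).iloc[0])  # 0.123456789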
upgini/autofe/unary.py
CHANGED
@@ -22,10 +22,10 @@ class Log(PandasOperand):
     output_type = "float"
 
     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return self._round_value(np.log(np.abs(data.replace(0, np.nan))))
+        return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
 
     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return self._round_value(np.log(data.replace(0, np.nan).abs()))
+        return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
 
 
 class Sqrt(PandasOperand):
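Log now truncates its output at a fixed 10 decimal places rather than the dtype-derived precision, so the generated feature is reproducible across float widths. Zeros are swapped for NaN before the log so np.log never sees them; a quick illustration of that inner expression:

import numpy as np
import pandas as pd

data = pd.Series([0.0, 1.0, -100.0])
# replace(0, np.nan) avoids log(0) = -inf; abs() handles negative inputs
print(np.log(np.abs(data.replace(0, np.nan))).tolist())  # [nan, 0.0, 4.605170185988092]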
upgini/dataset.py
CHANGED
@@ -662,15 +662,15 @@ class Dataset: # (pd.DataFrame):
         # if self.task_type != ModelTaskType.MULTICLASS:
         #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
 
-        keys_to_validate = [
+        keys_to_validate = {
            key
            for search_group in self.search_keys_checked
            for key in search_group
            if self.columns_renaming.get(key) != EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME
-        ]
-        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS)
+        }
+        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
         if (
-            FileColumnMeaningType.IPV6_ADDRESS in self.etalon_def_checked
+            FileColumnMeaningType.IPV6_ADDRESS.value in self.etalon_def_checked
             and ipv4_column is not None
             and ipv4_column in keys_to_validate
         ):
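The .value fixes matter because etalon_def_checked is keyed by the enums' string values, so a lookup with the enum member itself can never hit. A minimal illustration with a hypothetical plain Enum (the real FileColumnMeaningType may differ):

from enum import Enum

class Meaning(Enum):
    IP_ADDRESS = "IP_ADDRESS"

etalon_def = {"IP_ADDRESS": "ip_col"}  # keyed by string values
print(etalon_def.get(Meaning.IP_ADDRESS))        # None: the member is not its value
print(etalon_def.get(Meaning.IP_ADDRESS.value))  # ip_col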
upgini/features_enricher.py
CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import gc
 import hashlib
 import itertools
@@ -9,8 +10,7 @@ import sys
 import tempfile
 import time
 import uuid
-from
-from functools import reduce
+from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -21,6 +21,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
+from sklearn.model_selection._split import GroupsConsumerMixin
 
 from upgini.autofe.feature import Feature
 from upgini.data_source.data_source_publisher import CommercialSchema
@@ -59,7 +60,7 @@ from upgini.utils.custom_loss_utils import (
     get_additional_params_custom_loss,
     get_runtime_params_custom_loss,
 )
-from upgini.utils.cv_utils import CVConfig
+from upgini.utils.cv_utils import CVConfig, get_groups
 from upgini.utils.datetime_utils import (
     DateTimeSearchKeyConverter,
     is_blocked_time_series,
@@ -920,6 +921,7 @@ class FeaturesEnricher(TransformerMixin):
                 fitting_eval_set_dict,
                 search_keys,
                 groups,
+                _cv,
             ) = prepared_data
 
             gc.collect()
@@ -936,16 +938,6 @@ class FeaturesEnricher(TransformerMixin):
 
             has_date = self._get_date_column(search_keys) is not None
             model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
-            _cv = cv or self.cv
-            if groups is None and _cv == CVType.group_k_fold:
-                self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
-                _cv = CVType.k_fold
-            if not isinstance(_cv, BaseCrossValidator):
-                date_column = self._get_date_column(search_keys)
-                date_series = validated_X[date_column] if date_column is not None else None
-                _cv = CVConfig(
-                    _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold()
-                ).get_cv()
 
             wrapper = EstimatorWrapper.create(
                 estimator,
@@ -1213,7 +1205,7 @@ class FeaturesEnricher(TransformerMixin):
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
             extended_X = converter.convert(extended_X, keep_time=True)
             generated_features.extend(converter.generated_features)
         email_column = self._get_email_column(search_keys)
@@ -1265,6 +1257,27 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.info("Passed X, y and eval_set that differs from passed on fit. Transform will be used")
             return False, X, y, checked_eval_set
 
+    def _get_cv_and_groups(
+        self,
+        X: pd.DataFrame,
+        cv_override: Union[BaseCrossValidator, CVType, str, None],
+        search_keys: Dict[str, SearchKey],
+    ) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
+        _cv = cv_override or self.cv
+        group_columns = sorted(self._get_group_columns(X, search_keys))
+        groups = None
+
+        if not isinstance(_cv, BaseCrossValidator):
+            date_column = self._get_date_column(search_keys)
+            date_series = X[date_column] if date_column is not None else None
+            _cv, groups = CVConfig(
+                _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
+            ).get_cv_and_groups(X)
+        elif isinstance(_cv, GroupsConsumerMixin):
+            groups = get_groups(X, group_columns)
+
+        return _cv, groups
+
     def _prepare_data_for_metrics(
         self,
         trace_id: str,
@@ -1275,6 +1288,7 @@ class FeaturesEnricher(TransformerMixin):
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
+        cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
         search_keys_for_metrics: Optional[List[str]] = None,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
@@ -1290,7 +1304,7 @@ class FeaturesEnricher(TransformerMixin):
             else None
         )
 
-        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys) = self._sample_data_for_metrics(
+        sampled_data = self._sample_data_for_metrics(
            trace_id,
            validated_X,
            validated_y,
@@ -1302,6 +1316,7 @@ class FeaturesEnricher(TransformerMixin):
            progress_bar,
            progress_callback,
        )
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
 
         excluding_search_keys = list(search_keys.keys())
         if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
@@ -1325,14 +1340,7 @@ class FeaturesEnricher(TransformerMixin):
         X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
         enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
 
-
-        groups = (
-            None
-            if not group_columns or self.cv != CVType.group_k_fold
-            else reduce(
-                lambda left, right: left + "_" + right, [enriched_X_sorted[c].astype(str) for c in group_columns]
-            ).factorize()[0]
-        )
+        cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)
 
         existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
 
@@ -1382,11 +1390,16 @@ class FeaturesEnricher(TransformerMixin):
             fitting_eval_set_dict,
             search_keys,
             groups,
+            cv,
         )
 
-
-
-
+    @dataclass
+    class _SampledDataForMetrics:
+        X_sampled: pd.DataFrame
+        y_sampled: pd.Series
+        enriched_X: pd.DataFrame
+        eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
+        search_keys: Dict[str, SearchKey]
 
     def _sample_data_for_metrics(
         self,
@@ -1571,7 +1584,12 @@ class FeaturesEnricher(TransformerMixin):
                 df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
 
             _, df_with_eval_set_index = remove_fintech_duplicates(
-                df_with_eval_set_index,
+                df_with_eval_set_index,
+                self.search_keys,
+                date_format=self.date_format,
+                logger=self.logger,
+                silent=True,
+                bundle=self.bundle,
             )
 
             # downsample if need to eval_set threshold
@@ -1673,7 +1691,11 @@ class FeaturesEnricher(TransformerMixin):
     ):
         search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
         return FeaturesEnricher._SampledDataForMetrics(
-            X_sampled,
+            X_sampled=X_sampled,
+            y_sampled=y_sampled,
+            enriched_X=enriched_X,
+            eval_set_sampled_dict=eval_set_sampled_dict,
+            search_keys=search_keys,
         )
 
     def get_search_id(self) -> Optional[str]:
@@ -1810,7 +1832,7 @@ class FeaturesEnricher(TransformerMixin):
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
             df = converter.convert(df)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             generated_features.extend(converter.generated_features)
@@ -1868,7 +1890,9 @@ class FeaturesEnricher(TransformerMixin):
 
         df_without_features = df.drop(columns=non_keys_columns)
 
-        df_without_features = clean_full_duplicates(
+        df_without_features = clean_full_duplicates(
+            df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
+        )
 
         del df
         gc.collect()
@@ -2148,9 +2172,11 @@ class FeaturesEnricher(TransformerMixin):
 
         df = self.__add_country_code(df, self.fit_search_keys)
 
-        need_full_defuplication, df = remove_fintech_duplicates(
+        need_full_defuplication, df = remove_fintech_duplicates(
+            df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
+        )
         if need_full_defuplication:
-            df = clean_full_duplicates(df, self.logger)
+            df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
 
         date_column = self._get_date_column(self.fit_search_keys)
         self.__adjust_cv(df, date_column, model_task_type)
@@ -2158,7 +2184,7 @@ class FeaturesEnricher(TransformerMixin):
         self.fit_generated_features = []
 
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
+            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
             df = converter.convert(df, keep_time=True)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             self.fit_generated_features.extend(converter.generated_features)
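Two structural changes stand out here: CV selection and group construction move into a single _get_cv_and_groups helper, and the positional result tuple of _sample_data_for_metrics becomes a @dataclass whose fields can still be unpacked in declaration order via dataclasses.astuple. A standalone sketch of that unpacking pattern with simplified field types (the real fields are DataFrames and Series):

import dataclasses
from dataclasses import dataclass

@dataclass
class SampledData:  # simplified stand-in for _SampledDataForMetrics
    X_sampled: list
    y_sampled: list
    enriched_X: list

data = SampledData(X_sampled=[1], y_sampled=[0], enriched_X=[2])
# astuple flattens fields in declaration order, so tuple unpacking keeps working;
# note it deep-copies field values, which is cheap here but not free for DataFrames
X_sampled, y_sampled, enriched_X = dataclasses.astuple(data)
print(X_sampled, y_sampled, enriched_X)  # [1] [0] [2]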
upgini/metadata.py
CHANGED
upgini/resource_bundle/strings.properties
CHANGED
@@ -89,6 +89,8 @@ empty_search_key=Search key {} is empty. Please fill values or remove this searc
 single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
 unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
 date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
+invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
+unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
 invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
 invalid_country=All values of COUNTRY column `{}` are invalid
 invalid_ip=All values of IPv4 column `{}` are invalid
upgini/resource_bundle/strings_widget.properties
CHANGED
@@ -9,6 +9,8 @@ dataset_too_few_rows=Labeled dataset size with unique search keys must be not le
 dataset_too_big_file=Too big size of labeled dataset for processing. Please reduce number of rows or columns
 dataset_too_many_rows_registered=Labeled dataset size rows limit is {}. Please sample it and retry
 dataset_all_dates_old=There is empty labeled dataset after removing data before '2000-01-01'
+invalid_date_format=Failed to parse date in column `{}`
+unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
 
 # Validate target
 y_is_constant=Target label is a constant. Relevant feature search requires a non-constant target
upgini/utils/cv_utils.py
CHANGED
@@ -1,4 +1,6 @@
-from
+from functools import reduce
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
 
 import pandas as pd
 from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
@@ -14,6 +16,9 @@ class CVConfig:
         date_column: Optional[pd.Series],
         random_state=None,
         shuffle_kfold: Optional[bool] = None,
+        test_size: Optional[float] = 0.2,
+        n_folds: Optional[int] = 5,
+        group_columns: Optional[List[str]] = None,
     ):
         if cv_type is None:
             self.cv_type = CVType.k_fold
@@ -24,9 +29,10 @@ class CVConfig:
         else:
             raise Exception(f"Unexpected type of cv_type: {type(cv_type)}")
 
+        self.group_columns = group_columns
         self.shuffle_kfold: Optional[bool] = shuffle_kfold
-        self.test_size = 0.2
-        self.n_folds = 5
+        self.test_size = test_size
+        self.n_folds = n_folds
         if (self.cv_type == CVType.k_fold or self.cv_type == CVType.group_k_fold) and self.shuffle_kfold is None:
             self.shuffle_kfold = date_column is None or is_constant(date_column)
         if self.shuffle_kfold:
@@ -45,17 +51,37 @@ class CVConfig:
             config["test_size"] = self.test_size
         return config
 
-    def get_cv(self) -> BaseCrossValidator:
+    def get_cv_and_groups(self, X: pd.DataFrame) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
         if self.cv_type == CVType.time_series:
-            return TimeSeriesSplit(n_splits=self.n_folds)
+            return TimeSeriesSplit(n_splits=self.n_folds), None
         elif self.cv_type == CVType.blocked_time_series:
-            return BlockedTimeSeriesSplit(n_splits=self.n_folds, test_size=self.test_size)
-        elif self.cv_type == CVType.group_k_fold and self.
-
-
-
+            return BlockedTimeSeriesSplit(n_splits=self.n_folds, test_size=self.test_size), None
+        elif self.cv_type == CVType.group_k_fold and self.group_columns:
+            groups = get_groups(X, self.group_columns)
+
+            if groups is None or np.unique(groups).size < self.n_folds:
+                return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state), None
+
+            if self.shuffle_kfold:
+                return (
+                    GroupShuffleSplit(n_splits=self.n_folds, test_size=self.test_size, random_state=self.random_state),
+                    groups,
+                )
+            else:
+                return GroupKFold(n_splits=self.n_folds), groups
         else:
-            return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state)
+            return KFold(n_splits=self.n_folds, shuffle=self.shuffle_kfold, random_state=self.random_state), None
+
+
+def get_groups(X: pd.DataFrame, group_columns: Optional[List[str]]) -> Optional[np.ndarray]:
+    existing_group_columns = [c for c in group_columns if c in X.columns]
+    return (
+        None
+        if not group_columns
+        else reduce(
+            lambda left, right: left + "_" + right, [X[c].astype(str) for c in existing_group_columns]
+        ).factorize()[0]
+    )
 
 
 def is_constant(s, dropna=True) -> bool:
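get_cv_and_groups now returns the groups array alongside the splitter, falling back to plain KFold when there are fewer distinct groups than folds. The module-level get_groups builds one composite label per row by string-concatenating the group columns and factorizing; roughly, under the diff's own definition:

import pandas as pd
from functools import reduce

def get_groups(X, group_columns):
    existing = [c for c in group_columns if c in X.columns]
    if not group_columns:
        return None
    # join values row-wise with "_" and map each distinct label to an integer code
    return reduce(lambda l, r: l + "_" + r, [X[c].astype(str) for c in existing]).factorize()[0]

X = pd.DataFrame({"city": ["a", "a", "b"], "year": [1, 2, 1]})
print(get_groups(X, ["city", "year"]))  # [0 1 2]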
upgini/utils/datetime_utils.py
CHANGED
@@ -9,6 +9,7 @@ from dateutil.relativedelta import relativedelta
 from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
 
 from upgini.errors import ValidationError
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 
 DATE_FORMATS = ["%Y-%m-%d", "%d.%m.%y", "%d.%m.%Y", "%m.%d.%y", "%m.%d.%Y", "%Y-%m-%dT%H:%M:%S.%f"]
 
@@ -18,7 +19,13 @@ DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
 class DateTimeSearchKeyConverter:
     DATETIME_COL = "_date_time"
 
-    def __init__(self, date_column: str, date_format: Optional[str] = None, logger: Optional[logging.Logger] = None):
+    def __init__(
+        self,
+        date_column: str,
+        date_format: Optional[str] = None,
+        logger: Optional[logging.Logger] = None,
+        bundle: ResourceBundle = None,
+    ):
         self.date_column = date_column
         self.date_format = date_format
         if logger is not None:
@@ -27,6 +34,7 @@ class DateTimeSearchKeyConverter:
             self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
+        self.bundle = bundle or get_custom_bundle()
 
     @staticmethod
     def _int_to_opt(i: int) -> Optional[int]:
@@ -71,10 +79,10 @@ class DateTimeSearchKeyConverter:
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
             elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-            elif df[self.date_column].apply(lambda x: 0 < x < 10*11).all():
+            elif df[self.date_column].apply(lambda x: 0 < x < 10 * 11).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
             else:
-                msg = f"Unsupported type of date column `{self.date_column}`. Convert to datetime please."
+                msg = self.bundle.get("unsupported_date_type").format(self.date_column)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
 
@@ -121,10 +129,7 @@ class DateTimeSearchKeyConverter:
                 return pd.to_datetime(df[self.date_column], format=date_format)
             except ValueError:
                 pass
-        raise ValidationError(
-            f"Failed to parse date in column `{self.date_column}`. "
-            "Try to pass explicit date format in date_format argument of FeaturesEnricher constructor"
-        )
+        raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
 
 
 def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
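For numeric date columns the converter infers the epoch unit from the magnitude of the values (microseconds above 10**14, milliseconds between 10**11 and 10**14, otherwise seconds) before calling pd.to_datetime. A condensed sketch of that dispatch, assuming second-resolution timestamps sit below 10**11:

import pandas as pd

def to_datetime_guess_unit(s: pd.Series) -> pd.Series:
    # epoch seconds today are ~10**9, milliseconds ~10**12, microseconds ~10**15
    if (s > 10**14).all():
        return pd.to_datetime(s, unit="us")
    if s.between(10**11, 10**14).all():
        return pd.to_datetime(s, unit="ms")
    if s.between(1, 10**11).all():
        return pd.to_datetime(s, unit="s")
    raise ValueError("unsupported date column")

print(to_datetime_guess_unit(pd.Series([1700000000])).iloc[0])  # 2023-11-14 22:13:20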
upgini/utils/deduplicate_utils.py
CHANGED
@@ -4,13 +4,18 @@ from typing import Dict, List, Optional, Tuple, Union
 import pandas as pd
 
 from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
-from upgini.resource_bundle import
+from upgini.resource_bundle import ResourceBundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
 
 
 def remove_fintech_duplicates(
-    df: pd.DataFrame,
+    df: pd.DataFrame,
+    search_keys: Dict[str, SearchKey],
+    date_format: Optional[str] = None,
+    logger: Optional[Logger] = None,
+    silent=False,
+    bundle: ResourceBundle = None,
 ) -> Tuple[bool, pd.DataFrame]:
     # Base checks
     need_full_deduplication = True
@@ -72,7 +77,7 @@ def remove_fintech_duplicates(
     nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
     sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
 
-    sub_df = DateTimeSearchKeyConverter(date_col).convert(sub_df)
+    sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
     grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
     rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
     if len(rows_with_diff_target) > 0:
@@ -95,7 +100,7 @@ def remove_fintech_duplicates(
 
 
 def clean_full_duplicates(
-    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False
+    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
 ) -> pd.DataFrame:
     nrows = len(df)
     if nrows == 0:
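Both helpers now accept an explicit ResourceBundle so their messages come from the localizable properties files rather than hardcoded strings, with a default bundle as fallback. A standalone sketch of the pattern (the Bundle class here is a stand-in for upgini's ResourceBundle):

from typing import Optional

class Bundle:
    """Stand-in for ResourceBundle: keyed message templates."""
    def __init__(self, messages: dict):
        self.messages = messages

    def get(self, key: str) -> str:
        return self.messages[key]

DEFAULT = Bundle({"invalid_date_format": "Failed to parse date in column `{}`"})

def convert(date_column: str, bundle: Optional[Bundle] = None) -> None:
    bundle = bundle or DEFAULT  # mirrors `self.bundle = bundle or get_custom_bundle()`
    raise ValueError(bundle.get("invalid_date_format").format(date_column))

try:
    convert("signup_date")
except ValueError as e:
    print(e)  # Failed to parse date in column `signup_date`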
{upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=8TE_NKdhNeL3mlU-S57bHFub4G0gmyOGtceqO-CRaMk,50436
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=xoaG_cEhQJciYHHE-UkjdkprfThNnxGRe9YvT0NVnoI,168617
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
-upgini/metadata.py,sha256=
+upgini/metadata.py,sha256=5SQjDsQrfpGKT42i0H40C6wxTY_up94xSqAARH140_U,9637
 upgini/metrics.py,sha256=LS2MgEKgmn9VEXsKzxv3pBZ-q71mTnpWu6vL8fYgpo4,26727
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
@@ -17,8 +17,8 @@ upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
 upgini/autofe/feature.py,sha256=xeqTq35-BX4KCt0xAkk3UZAGzV5VyjorV5AdNdA5yLs,11851
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
-upgini/autofe/operand.py,sha256=
-upgini/autofe/unary.py,sha256=
+upgini/autofe/operand.py,sha256=Rhy7Ky3we-I1Su1--dS4xdsO3K8neV4rqM_Q4xYE4ug,2779
+upgini/autofe/unary.py,sha256=gyMkrx9bfa3o19zS-4JaRlScHrfeZGBsYe7d_6ePT-0,2853
 upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=PeotNz-taBT6aweQc9xc_pdtUOqmqe1IZr1-3NeUsd0,14008
@@ -28,8 +28,8 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
 upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
-upgini/resource_bundle/strings_widget.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=dmkObOr0FJPkGQmiybmFcI3RbYjl_uCz_IQ4nm8SRkk,25242
+upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
 upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
@@ -39,9 +39,9 @@ upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCS
 upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
 upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
-upgini/utils/cv_utils.py,sha256=
-upgini/utils/datetime_utils.py,sha256=
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
+upgini/utils/datetime_utils.py,sha256=P5no4mFgYpEP6oY524ebTKvKc3TBMJzAYpWdj210_Fw,8699
+upgini/utils/deduplicate_utils.py,sha256=GRPwD8bXZNspKvf19W3SrYjqg1qQMDlZD-BDkHnKYyo,6176
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -55,8 +55,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
 upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.246a101.dist-info/LICENSE,sha256=
-upgini-1.1.246a101.dist-info/METADATA,sha256=
-upgini-1.1.246a101.dist-info/WHEEL,sha256=
-upgini-1.1.246a101.dist-info/top_level.txt,sha256=
-upgini-1.1.246a101.dist-info/RECORD,,
+upgini-1.1.248a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.248a1.dist-info/METADATA,sha256=leV6Cyrr3xf01Jd34x-1asI-e4rk2bjDPJvdUsaagNg,48208
+upgini-1.1.248a1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+upgini-1.1.248a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.248a1.dist-info/RECORD,,
{upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/LICENSE
File without changes
{upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/WHEEL
File without changes
{upgini-1.1.246a101.dist-info → upgini-1.1.248a1.dist-info}/top_level.txt
File without changes