upgini 1.2.37__py3-none-any.whl → 1.2.38a3769.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +20 -2
- upgini/features_enricher.py +43 -24
- upgini/metadata.py +3 -0
- upgini/resource_bundle/strings.properties +0 -1
- upgini/utils/target_utils.py +66 -3
- {upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/RECORD +10 -10
- {upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.37"
|
|
1
|
+
__version__ = "1.2.38a3769.dev1"
|
upgini/dataset.py
CHANGED
|
@@ -22,6 +22,7 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
+
CVType,
|
|
25
26
|
DataType,
|
|
26
27
|
FeaturesFilter,
|
|
27
28
|
FileColumnMeaningType,
|
|
@@ -32,11 +33,12 @@ from upgini.metadata import (
|
|
|
32
33
|
NumericInterval,
|
|
33
34
|
RuntimeParameters,
|
|
34
35
|
SearchCustomization,
|
|
36
|
+
SearchKey,
|
|
35
37
|
)
|
|
36
38
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
37
39
|
from upgini.search_task import SearchTask
|
|
38
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
39
|
-
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
|
|
41
|
+
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
|
|
40
42
|
|
|
41
43
|
try:
|
|
42
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -74,6 +76,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
74
76
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
75
77
|
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
76
78
|
model_task_type: Optional[ModelTaskType] = None,
|
|
79
|
+
cv_type: Optional[CVType] = None,
|
|
77
80
|
random_state: Optional[int] = None,
|
|
78
81
|
rest_client: Optional[_RestClient] = None,
|
|
79
82
|
logger: Optional[logging.Logger] = None,
|
|
@@ -104,6 +107,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
104
107
|
|
|
105
108
|
self.dataset_name = dataset_name
|
|
106
109
|
self.task_type = model_task_type
|
|
110
|
+
self.cv_type = cv_type
|
|
107
111
|
self.description = description
|
|
108
112
|
self.meaning_types = meaning_types
|
|
109
113
|
self.search_keys = search_keys
|
|
@@ -225,6 +229,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
229
|
df=self.data,
|
|
226
230
|
target_column=target_column,
|
|
227
231
|
task_type=self.task_type,
|
|
232
|
+
cv_type=self.cv_type,
|
|
228
233
|
random_state=self.random_state,
|
|
229
234
|
sample_size=self.FORCE_SAMPLE_SIZE,
|
|
230
235
|
logger=self.logger,
|
|
@@ -297,7 +302,20 @@ class Dataset: # (pd.DataFrame):
|
|
|
297
302
|
f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
|
|
298
303
|
f"and will be downsampled to {sample_rows}"
|
|
299
304
|
)
|
|
300
|
-
|
|
305
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
306
|
+
resampled_data = balance_undersample_time_series(
|
|
307
|
+
df=self.data,
|
|
308
|
+
id_columns=[k for k, v in self.meaning_types.items() if v == FileColumnMeaningType.CUSTOM_KEY],
|
|
309
|
+
date_column=next(
|
|
310
|
+
k
|
|
311
|
+
for k, v in self.meaning_types.items()
|
|
312
|
+
if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
|
|
313
|
+
),
|
|
314
|
+
sample_size=sample_rows,
|
|
315
|
+
logger=self.logger,
|
|
316
|
+
)
|
|
317
|
+
else:
|
|
318
|
+
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
301
319
|
self.data = resampled_data
|
|
302
320
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
303
321
|
|
upgini/features_enricher.py
CHANGED
|
@@ -237,6 +237,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
237
|
add_date_if_missing: bool = True,
|
|
238
238
|
select_features: bool = False,
|
|
239
239
|
disable_force_downsampling: bool = False,
|
|
240
|
+
id_columns: Optional[List[str]] = None,
|
|
240
241
|
**kwargs,
|
|
241
242
|
):
|
|
242
243
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,9 +278,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
278
|
)
|
|
278
279
|
|
|
279
280
|
validate_version(self.logger, self.__log_warning)
|
|
281
|
+
|
|
280
282
|
self.search_keys = search_keys or {}
|
|
283
|
+
self.id_columns = id_columns
|
|
284
|
+
if id_columns is not None:
|
|
285
|
+
self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
|
|
281
286
|
self.country_code = country_code
|
|
282
287
|
self.__validate_search_keys(search_keys, search_id)
|
|
288
|
+
|
|
283
289
|
self.model_task_type = model_task_type
|
|
284
290
|
self.endpoint = endpoint
|
|
285
291
|
self._search_task: Optional[SearchTask] = None
|
|
@@ -983,7 +989,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
983
989
|
with Spinner():
|
|
984
990
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
985
991
|
|
|
986
|
-
has_date =
|
|
992
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
987
993
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
988
994
|
|
|
989
995
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1185,7 +1191,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1185
1191
|
)
|
|
1186
1192
|
|
|
1187
1193
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1188
|
-
date_column =
|
|
1194
|
+
date_column = self._get_date_column(search_keys)
|
|
1189
1195
|
if (
|
|
1190
1196
|
uplift_col in metrics_df.columns
|
|
1191
1197
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1354,7 +1360,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1354
1360
|
groups = None
|
|
1355
1361
|
|
|
1356
1362
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1357
|
-
date_column =
|
|
1363
|
+
date_column = self._get_date_column(search_keys)
|
|
1358
1364
|
date_series = X[date_column] if date_column is not None else None
|
|
1359
1365
|
_cv, groups = CVConfig(
|
|
1360
1366
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1667,7 +1673,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1667
1673
|
search_keys = self.search_keys.copy()
|
|
1668
1674
|
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1669
1675
|
|
|
1670
|
-
date_column =
|
|
1676
|
+
date_column = self._get_date_column(search_keys)
|
|
1671
1677
|
generated_features = []
|
|
1672
1678
|
if date_column is not None:
|
|
1673
1679
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
@@ -1741,7 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1741
1747
|
search_keys = self.fit_search_keys
|
|
1742
1748
|
|
|
1743
1749
|
rows_to_drop = None
|
|
1744
|
-
has_date =
|
|
1750
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1745
1751
|
self.model_task_type = self.model_task_type or define_task(
|
|
1746
1752
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1747
1753
|
)
|
|
@@ -1853,7 +1859,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1853
1859
|
df = balance_undersample_forced(
|
|
1854
1860
|
df=df,
|
|
1855
1861
|
target_column=TARGET,
|
|
1862
|
+
id_columns=self.id_columns,
|
|
1863
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1856
1864
|
task_type=self.model_task_type,
|
|
1865
|
+
cv_type=self.cv,
|
|
1857
1866
|
random_state=self.random_state,
|
|
1858
1867
|
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1859
1868
|
logger=self.logger,
|
|
@@ -1995,7 +2004,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1995
2004
|
trace_id = trace_id or uuid.uuid4()
|
|
1996
2005
|
return search_task.get_progress(trace_id)
|
|
1997
2006
|
|
|
1998
|
-
def get_transactional_transform_api(self
|
|
2007
|
+
def get_transactional_transform_api(self):
|
|
1999
2008
|
if self.api_key is None:
|
|
2000
2009
|
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
2001
2010
|
if self._search_task is None:
|
|
@@ -2053,7 +2062,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2053
2062
|
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
2054
2063
|
-H 'Authorization: {self.api_key}' \\
|
|
2055
2064
|
-H 'Content-Type: application/json' \\
|
|
2056
|
-
-d '{{"search_keys": {keys}{features_section}
|
|
2065
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
2057
2066
|
return api_example
|
|
2058
2067
|
|
|
2059
2068
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -2102,10 +2111,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2102
2111
|
self.logger.warning(
|
|
2103
2112
|
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2104
2113
|
)
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
print(msg)
|
|
2108
|
-
print(self.get_transactional_transform_api(only_online_sources=True))
|
|
2114
|
+
# TODO
|
|
2115
|
+
raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
|
|
2109
2116
|
|
|
2110
2117
|
if not metrics_calculation:
|
|
2111
2118
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -2155,7 +2162,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2155
2162
|
df = self.__add_country_code(df, search_keys)
|
|
2156
2163
|
|
|
2157
2164
|
generated_features = []
|
|
2158
|
-
date_column =
|
|
2165
|
+
date_column = self._get_date_column(search_keys)
|
|
2159
2166
|
if date_column is not None:
|
|
2160
2167
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2161
2168
|
df = converter.convert(df, keep_time=True)
|
|
@@ -2163,7 +2170,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2163
2170
|
generated_features.extend(converter.generated_features)
|
|
2164
2171
|
else:
|
|
2165
2172
|
self.logger.info("Input dataset hasn't date column")
|
|
2166
|
-
if self.
|
|
2173
|
+
if self.__should_add_date_column():
|
|
2167
2174
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2168
2175
|
|
|
2169
2176
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -2446,7 +2453,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2446
2453
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2447
2454
|
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2448
2455
|
for multi_key in multi_keys:
|
|
2449
|
-
if multi_key not in [
|
|
2456
|
+
if multi_key not in [
|
|
2457
|
+
SearchKey.PHONE,
|
|
2458
|
+
SearchKey.IP,
|
|
2459
|
+
SearchKey.POSTAL_CODE,
|
|
2460
|
+
SearchKey.EMAIL,
|
|
2461
|
+
SearchKey.HEM,
|
|
2462
|
+
SearchKey.CUSTOM_KEY,
|
|
2463
|
+
]:
|
|
2450
2464
|
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2451
2465
|
self.logger.warning(msg)
|
|
2452
2466
|
raise ValidationError(msg)
|
|
@@ -2610,7 +2624,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2610
2624
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2611
2625
|
else:
|
|
2612
2626
|
self.logger.info("Input dataset hasn't date column")
|
|
2613
|
-
if self.
|
|
2627
|
+
if self.__should_add_date_column():
|
|
2614
2628
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2615
2629
|
|
|
2616
2630
|
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
@@ -2764,6 +2778,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2764
2778
|
search_keys=combined_search_keys,
|
|
2765
2779
|
unnest_search_keys=unnest_search_keys,
|
|
2766
2780
|
model_task_type=self.model_task_type,
|
|
2781
|
+
cv_type=self.cv,
|
|
2767
2782
|
date_format=self.date_format,
|
|
2768
2783
|
random_state=self.random_state,
|
|
2769
2784
|
rest_client=self.rest_client,
|
|
@@ -2920,6 +2935,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2920
2935
|
if not self.warning_counter.has_warnings():
|
|
2921
2936
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2922
2937
|
|
|
2938
|
+
def __should_add_date_column(self):
|
|
2939
|
+
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
|
2940
|
+
|
|
2923
2941
|
def __adjust_cv(self, df: pd.DataFrame):
|
|
2924
2942
|
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2925
2943
|
# Check Multivariate time series
|
|
@@ -3165,7 +3183,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3165
3183
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
3166
3184
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3167
3185
|
else:
|
|
3168
|
-
date_column =
|
|
3186
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
3169
3187
|
sort_columns = [date_column] if date_column is not None else []
|
|
3170
3188
|
|
|
3171
3189
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3357,6 +3375,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3357
3375
|
if t == SearchKey.POSTAL_CODE:
|
|
3358
3376
|
return col
|
|
3359
3377
|
|
|
3378
|
+
@staticmethod
|
|
3379
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3380
|
+
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3381
|
+
|
|
3360
3382
|
def _explode_multiple_search_keys(
|
|
3361
3383
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
3362
3384
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
@@ -3365,7 +3387,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3365
3387
|
for key_name, key_type in search_keys.items():
|
|
3366
3388
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3367
3389
|
search_key_names_by_type = {
|
|
3368
|
-
key_type: key_names
|
|
3390
|
+
key_type: key_names
|
|
3391
|
+
for key_type, key_names in search_key_names_by_type.items()
|
|
3392
|
+
if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
|
|
3369
3393
|
}
|
|
3370
3394
|
if len(search_key_names_by_type) == 0:
|
|
3371
3395
|
return df, {}
|
|
@@ -3418,9 +3442,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3418
3442
|
]
|
|
3419
3443
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3420
3444
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3421
|
-
sort_exclude_columns.append(
|
|
3445
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3422
3446
|
else:
|
|
3423
|
-
date_column =
|
|
3447
|
+
date_column = self._get_date_column(search_keys)
|
|
3424
3448
|
sort_columns = [date_column] if date_column is not None else []
|
|
3425
3449
|
|
|
3426
3450
|
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
@@ -3856,11 +3880,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3856
3880
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3857
3881
|
raise ValidationError(msg)
|
|
3858
3882
|
|
|
3859
|
-
if SearchKey.CUSTOM_KEY in valid_search_keys.values():
|
|
3860
|
-
custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
|
|
3861
|
-
for key in custom_keys:
|
|
3862
|
-
del valid_search_keys[key]
|
|
3863
|
-
|
|
3864
3883
|
if (
|
|
3865
3884
|
len(valid_search_keys.values()) == 1
|
|
3866
3885
|
and self.country_code is None
|
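upgini/metadata.py
CHANGED
|
(the +3 -0 hunk for this file is missing from this extract)
|
upgini/resource_bundle/strings.properties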
CHANGED
|
@@ -216,7 +216,6 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
|
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
|
|
219
|
-
online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
|
|
220
219
|
|
|
221
220
|
# Validation table
|
|
222
221
|
validation_column_name_header=Column name
|
upgini/utils/target_utils.py
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
import logging
|
|
2
|
-
from typing import Callable, Optional, Union
|
|
3
|
+
from typing import Callable, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
7
8
|
|
|
8
9
|
from upgini.errors import ValidationError
|
|
9
|
-
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
10
|
+
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
11
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
12
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
13
|
|
|
14
|
+
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
15
18
|
if isinstance(y, pd.Series):
|
|
@@ -201,7 +204,10 @@ def balance_undersample(
|
|
|
201
204
|
def balance_undersample_forced(
|
|
202
205
|
df: pd.DataFrame,
|
|
203
206
|
target_column: str,
|
|
207
|
+
id_columns: List[str],
|
|
208
|
+
date_column: str,
|
|
204
209
|
task_type: ModelTaskType,
|
|
210
|
+
cv_type: CVType | None,
|
|
205
211
|
random_state: int,
|
|
206
212
|
sample_size: int = 7000,
|
|
207
213
|
logger: Optional[logging.Logger] = None,
|
|
@@ -233,7 +239,16 @@ def balance_undersample_forced(
|
|
|
233
239
|
|
|
234
240
|
resampled_data = df
|
|
235
241
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
236
|
-
if
|
|
242
|
+
if cv_type is not None and cv_type.is_time_series():
|
|
243
|
+
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
+
resampled_data = balance_undersample_time_series(
|
|
245
|
+
df,
|
|
246
|
+
id_columns=id_columns,
|
|
247
|
+
date_column=date_column,
|
|
248
|
+
sample_size=sample_size,
|
|
249
|
+
logger=logger,
|
|
250
|
+
)
|
|
251
|
+
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
237
252
|
logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
|
|
238
253
|
resampled_data = df.sample(n=sample_size, random_state=random_state)
|
|
239
254
|
else:
|
|
@@ -264,6 +279,54 @@ def balance_undersample_forced(
|
|
|
264
279
|
return resampled_data
|
|
265
280
|
|
|
266
281
|
|
|
282
|
+
def balance_undersample_time_series(
|
|
283
|
+
df: pd.DataFrame,
|
|
284
|
+
id_columns: List[str],
|
|
285
|
+
date_column: str,
|
|
286
|
+
sample_size: int,
|
|
287
|
+
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
288
|
+
logger: Optional[logging.Logger] = None,
|
|
289
|
+
):
|
|
290
|
+
def ensure_tuple(x):
|
|
291
|
+
return tuple([x]) if not isinstance(x, tuple) else x
|
|
292
|
+
|
|
293
|
+
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
294
|
+
ids_sort = {ensure_tuple(k): (v["max"], v["count"]) for k, v in ids_sort.items()}
|
|
295
|
+
id_counts = df[id_columns].value_counts()
|
|
296
|
+
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
297
|
+
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
298
|
+
id_counts = id_counts[id_counts <= sample_size]
|
|
299
|
+
min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
|
|
300
|
+
|
|
301
|
+
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
302
|
+
if isinstance(sample_index, pd.MultiIndex):
|
|
303
|
+
return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
|
|
304
|
+
else:
|
|
305
|
+
return df[id_columns[0]].isin(sample_index)
|
|
306
|
+
|
|
307
|
+
if len(id_counts) < min_different_ids:
|
|
308
|
+
if logger is not None:
|
|
309
|
+
logger.info(
|
|
310
|
+
f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
|
|
311
|
+
)
|
|
312
|
+
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
313
|
+
ids_to_sample = date_counts.index[:min_different_ids]
|
|
314
|
+
mask = id_mask(ids_to_sample)
|
|
315
|
+
df = df[mask]
|
|
316
|
+
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
317
|
+
sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
|
|
318
|
+
df = df[df[date_column].isin(sample_date_counts.index)]
|
|
319
|
+
else:
|
|
320
|
+
if len(id_columns) > 1:
|
|
321
|
+
id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
|
|
322
|
+
else:
|
|
323
|
+
id_counts.index = [i[0] for i in id_counts.index]
|
|
324
|
+
mask = id_mask(id_counts.index)
|
|
325
|
+
df = df[mask]
|
|
326
|
+
|
|
327
|
+
return df
|
|
328
|
+
|
|
329
|
+
|
|
267
330
|
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
268
331
|
try:
|
|
269
332
|
df = pd.concat([expected, actual])
|
|
{upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=p0xaY3SHrNu5ANUCNBeoBbJ2dD9QsJL_eb_HjEWLp7Q,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=zJQUzCTcSV5bqZ9B0oy2a77-oigLmW9F8BGs23WYwA0,33109
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=NQuaXJTKf-CR6fM9fGrAjxYMxcoxGPO-YPvyHDRDfag,195477
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256
|
|
9
|
+
upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=TiYWmFnuhOq0R3aVg2nbA3F5AWLgjrgh68Yj6MhG-x8,27088
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -56,10 +56,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
58
58
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
59
|
-
upgini/utils/target_utils.py,sha256=
|
|
59
|
+
upgini/utils/target_utils.py,sha256=9LWG8LiCzgYD1h3_MvOFnN3BG8bMLnwfCWdRV47cs_I,13910
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.38a3769.dev1.dist-info/METADATA,sha256=xECfr7DVtLllQD_hQft1lzZVdFAXB1uMjGK_BkNXdLc,48604
|
|
63
|
+
upgini-1.2.38a3769.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.38a3769.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.38a3769.dev1.dist-info/RECORD,,
|
|
{upgini-1.2.37.dist-info → upgini-1.2.38a3769.dev1.dist-info}/licenses/LICENSE
File without changes
|