upgini 1.2.36a1__tar.gz → 1.2.38a3769.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/PKG-INFO +1 -1
- upgini-1.2.38a3769.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/dataset.py +20 -2
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/features_enricher.py +39 -18
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/metadata.py +3 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/target_utils.py +66 -3
- upgini-1.2.36a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/.gitignore +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/LICENSE +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/README.md +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/pyproject.toml +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/http.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/metrics.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.38a3769.dev1"
|
|
@@ -22,6 +22,7 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
+
CVType,
|
|
25
26
|
DataType,
|
|
26
27
|
FeaturesFilter,
|
|
27
28
|
FileColumnMeaningType,
|
|
@@ -32,11 +33,12 @@ from upgini.metadata import (
|
|
|
32
33
|
NumericInterval,
|
|
33
34
|
RuntimeParameters,
|
|
34
35
|
SearchCustomization,
|
|
36
|
+
SearchKey,
|
|
35
37
|
)
|
|
36
38
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
37
39
|
from upgini.search_task import SearchTask
|
|
38
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
39
|
-
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
|
|
41
|
+
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
|
|
40
42
|
|
|
41
43
|
try:
|
|
42
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -74,6 +76,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
74
76
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
75
77
|
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
76
78
|
model_task_type: Optional[ModelTaskType] = None,
|
|
79
|
+
cv_type: Optional[CVType] = None,
|
|
77
80
|
random_state: Optional[int] = None,
|
|
78
81
|
rest_client: Optional[_RestClient] = None,
|
|
79
82
|
logger: Optional[logging.Logger] = None,
|
|
@@ -104,6 +107,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
104
107
|
|
|
105
108
|
self.dataset_name = dataset_name
|
|
106
109
|
self.task_type = model_task_type
|
|
110
|
+
self.cv_type = cv_type
|
|
107
111
|
self.description = description
|
|
108
112
|
self.meaning_types = meaning_types
|
|
109
113
|
self.search_keys = search_keys
|
|
@@ -225,6 +229,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
229
|
df=self.data,
|
|
226
230
|
target_column=target_column,
|
|
227
231
|
task_type=self.task_type,
|
|
232
|
+
cv_type=self.cv_type,
|
|
228
233
|
random_state=self.random_state,
|
|
229
234
|
sample_size=self.FORCE_SAMPLE_SIZE,
|
|
230
235
|
logger=self.logger,
|
|
@@ -297,7 +302,20 @@ class Dataset: # (pd.DataFrame):
|
|
|
297
302
|
f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
|
|
298
303
|
f"and will be downsampled to {sample_rows}"
|
|
299
304
|
)
|
|
300
|
-
|
|
305
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
306
|
+
resampled_data = balance_undersample_time_series(
|
|
307
|
+
df=self.data,
|
|
308
|
+
id_columns=[k for k, v in self.meaning_types.items() if v == FileColumnMeaningType.CUSTOM_KEY],
|
|
309
|
+
date_column=next(
|
|
310
|
+
k
|
|
311
|
+
for k, v in self.meaning_types.items()
|
|
312
|
+
if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
|
|
313
|
+
),
|
|
314
|
+
sample_size=sample_rows,
|
|
315
|
+
logger=self.logger,
|
|
316
|
+
)
|
|
317
|
+
else:
|
|
318
|
+
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
301
319
|
self.data = resampled_data
|
|
302
320
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
303
321
|
|
|
@@ -237,6 +237,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
237
|
add_date_if_missing: bool = True,
|
|
238
238
|
select_features: bool = False,
|
|
239
239
|
disable_force_downsampling: bool = False,
|
|
240
|
+
id_columns: Optional[List[str]] = None,
|
|
240
241
|
**kwargs,
|
|
241
242
|
):
|
|
242
243
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,9 +278,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
278
|
)
|
|
278
279
|
|
|
279
280
|
validate_version(self.logger, self.__log_warning)
|
|
281
|
+
|
|
280
282
|
self.search_keys = search_keys or {}
|
|
283
|
+
self.id_columns = id_columns
|
|
284
|
+
if id_columns is not None:
|
|
285
|
+
self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
|
|
281
286
|
self.country_code = country_code
|
|
282
287
|
self.__validate_search_keys(search_keys, search_id)
|
|
288
|
+
|
|
283
289
|
self.model_task_type = model_task_type
|
|
284
290
|
self.endpoint = endpoint
|
|
285
291
|
self._search_task: Optional[SearchTask] = None
|
|
@@ -983,7 +989,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
983
989
|
with Spinner():
|
|
984
990
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
985
991
|
|
|
986
|
-
has_date =
|
|
992
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
987
993
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
988
994
|
|
|
989
995
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1185,7 +1191,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1185
1191
|
)
|
|
1186
1192
|
|
|
1187
1193
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1188
|
-
date_column =
|
|
1194
|
+
date_column = self._get_date_column(search_keys)
|
|
1189
1195
|
if (
|
|
1190
1196
|
uplift_col in metrics_df.columns
|
|
1191
1197
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1354,7 +1360,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1354
1360
|
groups = None
|
|
1355
1361
|
|
|
1356
1362
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1357
|
-
date_column =
|
|
1363
|
+
date_column = self._get_date_column(search_keys)
|
|
1358
1364
|
date_series = X[date_column] if date_column is not None else None
|
|
1359
1365
|
_cv, groups = CVConfig(
|
|
1360
1366
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1667,7 +1673,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1667
1673
|
search_keys = self.search_keys.copy()
|
|
1668
1674
|
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1669
1675
|
|
|
1670
|
-
date_column =
|
|
1676
|
+
date_column = self._get_date_column(search_keys)
|
|
1671
1677
|
generated_features = []
|
|
1672
1678
|
if date_column is not None:
|
|
1673
1679
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
@@ -1741,7 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1741
1747
|
search_keys = self.fit_search_keys
|
|
1742
1748
|
|
|
1743
1749
|
rows_to_drop = None
|
|
1744
|
-
has_date =
|
|
1750
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1745
1751
|
self.model_task_type = self.model_task_type or define_task(
|
|
1746
1752
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1747
1753
|
)
|
|
@@ -1853,7 +1859,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1853
1859
|
df = balance_undersample_forced(
|
|
1854
1860
|
df=df,
|
|
1855
1861
|
target_column=TARGET,
|
|
1862
|
+
id_columns=self.id_columns,
|
|
1863
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1856
1864
|
task_type=self.model_task_type,
|
|
1865
|
+
cv_type=self.cv,
|
|
1857
1866
|
random_state=self.random_state,
|
|
1858
1867
|
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1859
1868
|
logger=self.logger,
|
|
@@ -2153,7 +2162,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2153
2162
|
df = self.__add_country_code(df, search_keys)
|
|
2154
2163
|
|
|
2155
2164
|
generated_features = []
|
|
2156
|
-
date_column =
|
|
2165
|
+
date_column = self._get_date_column(search_keys)
|
|
2157
2166
|
if date_column is not None:
|
|
2158
2167
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2159
2168
|
df = converter.convert(df, keep_time=True)
|
|
@@ -2161,7 +2170,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2161
2170
|
generated_features.extend(converter.generated_features)
|
|
2162
2171
|
else:
|
|
2163
2172
|
self.logger.info("Input dataset hasn't date column")
|
|
2164
|
-
if self.
|
|
2173
|
+
if self.__should_add_date_column():
|
|
2165
2174
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2166
2175
|
|
|
2167
2176
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -2444,7 +2453,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2444
2453
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2445
2454
|
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2446
2455
|
for multi_key in multi_keys:
|
|
2447
|
-
if multi_key not in [
|
|
2456
|
+
if multi_key not in [
|
|
2457
|
+
SearchKey.PHONE,
|
|
2458
|
+
SearchKey.IP,
|
|
2459
|
+
SearchKey.POSTAL_CODE,
|
|
2460
|
+
SearchKey.EMAIL,
|
|
2461
|
+
SearchKey.HEM,
|
|
2462
|
+
SearchKey.CUSTOM_KEY,
|
|
2463
|
+
]:
|
|
2448
2464
|
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2449
2465
|
self.logger.warning(msg)
|
|
2450
2466
|
raise ValidationError(msg)
|
|
@@ -2608,7 +2624,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2608
2624
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2609
2625
|
else:
|
|
2610
2626
|
self.logger.info("Input dataset hasn't date column")
|
|
2611
|
-
if self.
|
|
2627
|
+
if self.__should_add_date_column():
|
|
2612
2628
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2613
2629
|
|
|
2614
2630
|
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
@@ -2762,6 +2778,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2762
2778
|
search_keys=combined_search_keys,
|
|
2763
2779
|
unnest_search_keys=unnest_search_keys,
|
|
2764
2780
|
model_task_type=self.model_task_type,
|
|
2781
|
+
cv_type=self.cv,
|
|
2765
2782
|
date_format=self.date_format,
|
|
2766
2783
|
random_state=self.random_state,
|
|
2767
2784
|
rest_client=self.rest_client,
|
|
@@ -2918,6 +2935,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2918
2935
|
if not self.warning_counter.has_warnings():
|
|
2919
2936
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2920
2937
|
|
|
2938
|
+
def __should_add_date_column(self):
|
|
2939
|
+
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
|
2940
|
+
|
|
2921
2941
|
def __adjust_cv(self, df: pd.DataFrame):
|
|
2922
2942
|
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2923
2943
|
# Check Multivariate time series
|
|
@@ -3163,7 +3183,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3163
3183
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
3164
3184
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3165
3185
|
else:
|
|
3166
|
-
date_column =
|
|
3186
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
3167
3187
|
sort_columns = [date_column] if date_column is not None else []
|
|
3168
3188
|
|
|
3169
3189
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3355,6 +3375,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3355
3375
|
if t == SearchKey.POSTAL_CODE:
|
|
3356
3376
|
return col
|
|
3357
3377
|
|
|
3378
|
+
@staticmethod
|
|
3379
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3380
|
+
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3381
|
+
|
|
3358
3382
|
def _explode_multiple_search_keys(
|
|
3359
3383
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
3360
3384
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
@@ -3363,7 +3387,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3363
3387
|
for key_name, key_type in search_keys.items():
|
|
3364
3388
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3365
3389
|
search_key_names_by_type = {
|
|
3366
|
-
key_type: key_names
|
|
3390
|
+
key_type: key_names
|
|
3391
|
+
for key_type, key_names in search_key_names_by_type.items()
|
|
3392
|
+
if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
|
|
3367
3393
|
}
|
|
3368
3394
|
if len(search_key_names_by_type) == 0:
|
|
3369
3395
|
return df, {}
|
|
@@ -3416,9 +3442,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3416
3442
|
]
|
|
3417
3443
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3418
3444
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3419
|
-
sort_exclude_columns.append(
|
|
3445
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3420
3446
|
else:
|
|
3421
|
-
date_column =
|
|
3447
|
+
date_column = self._get_date_column(search_keys)
|
|
3422
3448
|
sort_columns = [date_column] if date_column is not None else []
|
|
3423
3449
|
|
|
3424
3450
|
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
@@ -3854,11 +3880,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3854
3880
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3855
3881
|
raise ValidationError(msg)
|
|
3856
3882
|
|
|
3857
|
-
if SearchKey.CUSTOM_KEY in valid_search_keys.values():
|
|
3858
|
-
custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
|
|
3859
|
-
for key in custom_keys:
|
|
3860
|
-
del valid_search_keys[key]
|
|
3861
|
-
|
|
3862
3883
|
if (
|
|
3863
3884
|
len(valid_search_keys.values()) == 1
|
|
3864
3885
|
and self.country_code is None
|
|
@@ -1,15 +1,18 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
import logging
|
|
2
|
-
from typing import Callable, Optional, Union
|
|
3
|
+
from typing import Callable, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
7
8
|
|
|
8
9
|
from upgini.errors import ValidationError
|
|
9
|
-
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
10
|
+
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
11
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
12
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
13
|
|
|
14
|
+
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
15
18
|
if isinstance(y, pd.Series):
|
|
@@ -201,7 +204,10 @@ def balance_undersample(
|
|
|
201
204
|
def balance_undersample_forced(
|
|
202
205
|
df: pd.DataFrame,
|
|
203
206
|
target_column: str,
|
|
207
|
+
id_columns: List[str],
|
|
208
|
+
date_column: str,
|
|
204
209
|
task_type: ModelTaskType,
|
|
210
|
+
cv_type: CVType | None,
|
|
205
211
|
random_state: int,
|
|
206
212
|
sample_size: int = 7000,
|
|
207
213
|
logger: Optional[logging.Logger] = None,
|
|
@@ -233,7 +239,16 @@ def balance_undersample_forced(
|
|
|
233
239
|
|
|
234
240
|
resampled_data = df
|
|
235
241
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
236
|
-
if
|
|
242
|
+
if cv_type is not None and cv_type.is_time_series():
|
|
243
|
+
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
+
resampled_data = balance_undersample_time_series(
|
|
245
|
+
df,
|
|
246
|
+
id_columns=id_columns,
|
|
247
|
+
date_column=date_column,
|
|
248
|
+
sample_size=sample_size,
|
|
249
|
+
logger=logger,
|
|
250
|
+
)
|
|
251
|
+
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
237
252
|
logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
|
|
238
253
|
resampled_data = df.sample(n=sample_size, random_state=random_state)
|
|
239
254
|
else:
|
|
@@ -264,6 +279,54 @@ def balance_undersample_forced(
|
|
|
264
279
|
return resampled_data
|
|
265
280
|
|
|
266
281
|
|
|
282
|
+
def balance_undersample_time_series(
|
|
283
|
+
df: pd.DataFrame,
|
|
284
|
+
id_columns: List[str],
|
|
285
|
+
date_column: str,
|
|
286
|
+
sample_size: int,
|
|
287
|
+
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
288
|
+
logger: Optional[logging.Logger] = None,
|
|
289
|
+
):
|
|
290
|
+
def ensure_tuple(x):
|
|
291
|
+
return tuple([x]) if not isinstance(x, tuple) else x
|
|
292
|
+
|
|
293
|
+
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
294
|
+
ids_sort = {ensure_tuple(k): (v["max"], v["count"]) for k, v in ids_sort.items()}
|
|
295
|
+
id_counts = df[id_columns].value_counts()
|
|
296
|
+
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
297
|
+
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
298
|
+
id_counts = id_counts[id_counts <= sample_size]
|
|
299
|
+
min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
|
|
300
|
+
|
|
301
|
+
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
302
|
+
if isinstance(sample_index, pd.MultiIndex):
|
|
303
|
+
return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
|
|
304
|
+
else:
|
|
305
|
+
return df[id_columns[0]].isin(sample_index)
|
|
306
|
+
|
|
307
|
+
if len(id_counts) < min_different_ids:
|
|
308
|
+
if logger is not None:
|
|
309
|
+
logger.info(
|
|
310
|
+
f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
|
|
311
|
+
)
|
|
312
|
+
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
313
|
+
ids_to_sample = date_counts.index[:min_different_ids]
|
|
314
|
+
mask = id_mask(ids_to_sample)
|
|
315
|
+
df = df[mask]
|
|
316
|
+
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
317
|
+
sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
|
|
318
|
+
df = df[df[date_column].isin(sample_date_counts.index)]
|
|
319
|
+
else:
|
|
320
|
+
if len(id_columns) > 1:
|
|
321
|
+
id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
|
|
322
|
+
else:
|
|
323
|
+
id_counts.index = [i[0] for i in id_counts.index]
|
|
324
|
+
mask = id_mask(id_counts.index)
|
|
325
|
+
df = df[mask]
|
|
326
|
+
|
|
327
|
+
return df
|
|
328
|
+
|
|
329
|
+
|
|
267
330
|
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
268
331
|
try:
|
|
269
332
|
df = pd.concat([expected, actual])
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.36a1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.36a1 → upgini-1.2.38a3769.dev1}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|