upgini 1.2.37__tar.gz → 1.2.38a3769.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/PKG-INFO +1 -1
- upgini-1.2.38a3769.dev2/src/upgini/__about__.py +1 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/dataset.py +21 -2
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/features_enricher.py +44 -24
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/metadata.py +3 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings.properties +0 -1
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/target_utils.py +76 -3
- upgini-1.2.37/src/upgini/__about__.py +0 -1
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/.gitignore +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/LICENSE +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/README.md +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/pyproject.toml +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/errors.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/http.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/metrics.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/search_task.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/spinner.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/version_validator.py +0 -0
|
upgini-1.2.38a3769.dev2/src/upgini/__about__.py
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.38a3769.dev2"
|
|
{upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/dataset.py
@@ -22,6 +22,7 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
+
CVType,
|
|
25
26
|
DataType,
|
|
26
27
|
FeaturesFilter,
|
|
27
28
|
FileColumnMeaningType,
|
|
@@ -32,11 +33,12 @@ from upgini.metadata import (
|
|
|
32
33
|
NumericInterval,
|
|
33
34
|
RuntimeParameters,
|
|
34
35
|
SearchCustomization,
|
|
36
|
+
SearchKey,
|
|
35
37
|
)
|
|
36
38
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
37
39
|
from upgini.search_task import SearchTask
|
|
38
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
39
|
-
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
|
|
41
|
+
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
|
|
40
42
|
|
|
41
43
|
try:
|
|
42
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -74,6 +76,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
74
76
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
75
77
|
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
76
78
|
model_task_type: Optional[ModelTaskType] = None,
|
|
79
|
+
cv_type: Optional[CVType] = None,
|
|
77
80
|
random_state: Optional[int] = None,
|
|
78
81
|
rest_client: Optional[_RestClient] = None,
|
|
79
82
|
logger: Optional[logging.Logger] = None,
|
|
@@ -104,6 +107,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
104
107
|
|
|
105
108
|
self.dataset_name = dataset_name
|
|
106
109
|
self.task_type = model_task_type
|
|
110
|
+
self.cv_type = cv_type
|
|
107
111
|
self.description = description
|
|
108
112
|
self.meaning_types = meaning_types
|
|
109
113
|
self.search_keys = search_keys
|
|
@@ -225,6 +229,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
229
|
df=self.data,
|
|
226
230
|
target_column=target_column,
|
|
227
231
|
task_type=self.task_type,
|
|
232
|
+
cv_type=self.cv_type,
|
|
228
233
|
random_state=self.random_state,
|
|
229
234
|
sample_size=self.FORCE_SAMPLE_SIZE,
|
|
230
235
|
logger=self.logger,
|
|
@@ -297,7 +302,21 @@ class Dataset: # (pd.DataFrame):
|
|
|
297
302
|
f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
|
|
298
303
|
f"and will be downsampled to {sample_rows}"
|
|
299
304
|
)
|
|
300
|
-
|
|
305
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
306
|
+
resampled_data = balance_undersample_time_series(
|
|
307
|
+
df=self.data,
|
|
308
|
+
id_columns=[k for k, v in self.meaning_types.items() if v == FileColumnMeaningType.CUSTOM_KEY],
|
|
309
|
+
date_column=next(
|
|
310
|
+
k
|
|
311
|
+
for k, v in self.meaning_types.items()
|
|
312
|
+
if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
|
|
313
|
+
),
|
|
314
|
+
sample_size=sample_rows,
|
|
315
|
+
random_state=self.random_state,
|
|
316
|
+
logger=self.logger,
|
|
317
|
+
)
|
|
318
|
+
else:
|
|
319
|
+
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
301
320
|
self.data = resampled_data
|
|
302
321
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
303
322
|
|
|
{upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/features_enricher.py
@@ -237,6 +237,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
237
|
add_date_if_missing: bool = True,
|
|
238
238
|
select_features: bool = False,
|
|
239
239
|
disable_force_downsampling: bool = False,
|
|
240
|
+
id_columns: Optional[List[str]] = None,
|
|
240
241
|
**kwargs,
|
|
241
242
|
):
|
|
242
243
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,9 +278,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
278
|
)
|
|
278
279
|
|
|
279
280
|
validate_version(self.logger, self.__log_warning)
|
|
281
|
+
|
|
280
282
|
self.search_keys = search_keys or {}
|
|
283
|
+
self.id_columns = id_columns
|
|
281
284
|
self.country_code = country_code
|
|
282
285
|
self.__validate_search_keys(search_keys, search_id)
|
|
286
|
+
|
|
283
287
|
self.model_task_type = model_task_type
|
|
284
288
|
self.endpoint = endpoint
|
|
285
289
|
self._search_task: Optional[SearchTask] = None
|
|
@@ -983,7 +987,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
983
987
|
with Spinner():
|
|
984
988
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
985
989
|
|
|
986
|
-
has_date =
|
|
990
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
987
991
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
988
992
|
|
|
989
993
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1185,7 +1189,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1185
1189
|
)
|
|
1186
1190
|
|
|
1187
1191
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1188
|
-
date_column =
|
|
1192
|
+
date_column = self._get_date_column(search_keys)
|
|
1189
1193
|
if (
|
|
1190
1194
|
uplift_col in metrics_df.columns
|
|
1191
1195
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1354,7 +1358,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1354
1358
|
groups = None
|
|
1355
1359
|
|
|
1356
1360
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1357
|
-
date_column =
|
|
1361
|
+
date_column = self._get_date_column(search_keys)
|
|
1358
1362
|
date_series = X[date_column] if date_column is not None else None
|
|
1359
1363
|
_cv, groups = CVConfig(
|
|
1360
1364
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1667,7 +1671,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1667
1671
|
search_keys = self.search_keys.copy()
|
|
1668
1672
|
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1669
1673
|
|
|
1670
|
-
date_column =
|
|
1674
|
+
date_column = self._get_date_column(search_keys)
|
|
1671
1675
|
generated_features = []
|
|
1672
1676
|
if date_column is not None:
|
|
1673
1677
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
@@ -1741,7 +1745,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1741
1745
|
search_keys = self.fit_search_keys
|
|
1742
1746
|
|
|
1743
1747
|
rows_to_drop = None
|
|
1744
|
-
has_date =
|
|
1748
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1745
1749
|
self.model_task_type = self.model_task_type or define_task(
|
|
1746
1750
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1747
1751
|
)
|
|
@@ -1853,7 +1857,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1853
1857
|
df = balance_undersample_forced(
|
|
1854
1858
|
df=df,
|
|
1855
1859
|
target_column=TARGET,
|
|
1860
|
+
id_columns=self.id_columns,
|
|
1861
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1856
1862
|
task_type=self.model_task_type,
|
|
1863
|
+
cv_type=self.cv,
|
|
1857
1864
|
random_state=self.random_state,
|
|
1858
1865
|
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1859
1866
|
logger=self.logger,
|
|
@@ -1995,7 +2002,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1995
2002
|
trace_id = trace_id or uuid.uuid4()
|
|
1996
2003
|
return search_task.get_progress(trace_id)
|
|
1997
2004
|
|
|
1998
|
-
def get_transactional_transform_api(self
|
|
2005
|
+
def get_transactional_transform_api(self):
|
|
1999
2006
|
if self.api_key is None:
|
|
2000
2007
|
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
2001
2008
|
if self._search_task is None:
|
|
@@ -2053,7 +2060,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2053
2060
|
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
2054
2061
|
-H 'Authorization: {self.api_key}' \\
|
|
2055
2062
|
-H 'Content-Type: application/json' \\
|
|
2056
|
-
-d '{{"search_keys": {keys}{features_section}
|
|
2063
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
2057
2064
|
return api_example
|
|
2058
2065
|
|
|
2059
2066
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -2102,10 +2109,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2102
2109
|
self.logger.warning(
|
|
2103
2110
|
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2104
2111
|
)
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
print(msg)
|
|
2108
|
-
print(self.get_transactional_transform_api(only_online_sources=True))
|
|
2112
|
+
# TODO
|
|
2113
|
+
raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
|
|
2109
2114
|
|
|
2110
2115
|
if not metrics_calculation:
|
|
2111
2116
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -2155,7 +2160,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2155
2160
|
df = self.__add_country_code(df, search_keys)
|
|
2156
2161
|
|
|
2157
2162
|
generated_features = []
|
|
2158
|
-
date_column =
|
|
2163
|
+
date_column = self._get_date_column(search_keys)
|
|
2159
2164
|
if date_column is not None:
|
|
2160
2165
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2161
2166
|
df = converter.convert(df, keep_time=True)
|
|
@@ -2163,7 +2168,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2163
2168
|
generated_features.extend(converter.generated_features)
|
|
2164
2169
|
else:
|
|
2165
2170
|
self.logger.info("Input dataset hasn't date column")
|
|
2166
|
-
if self.
|
|
2171
|
+
if self.__should_add_date_column():
|
|
2167
2172
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2168
2173
|
|
|
2169
2174
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -2446,7 +2451,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2446
2451
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2447
2452
|
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2448
2453
|
for multi_key in multi_keys:
|
|
2449
|
-
if multi_key not in [
|
|
2454
|
+
if multi_key not in [
|
|
2455
|
+
SearchKey.PHONE,
|
|
2456
|
+
SearchKey.IP,
|
|
2457
|
+
SearchKey.POSTAL_CODE,
|
|
2458
|
+
SearchKey.EMAIL,
|
|
2459
|
+
SearchKey.HEM,
|
|
2460
|
+
SearchKey.CUSTOM_KEY,
|
|
2461
|
+
]:
|
|
2450
2462
|
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2451
2463
|
self.logger.warning(msg)
|
|
2452
2464
|
raise ValidationError(msg)
|
|
@@ -2610,7 +2622,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2610
2622
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2611
2623
|
else:
|
|
2612
2624
|
self.logger.info("Input dataset hasn't date column")
|
|
2613
|
-
if self.
|
|
2625
|
+
if self.__should_add_date_column():
|
|
2614
2626
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2615
2627
|
|
|
2616
2628
|
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
@@ -2643,6 +2655,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2643
2655
|
|
|
2644
2656
|
self.__adjust_cv(df)
|
|
2645
2657
|
|
|
2658
|
+
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2659
|
+
self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
|
|
2660
|
+
|
|
2646
2661
|
df, fintech_warnings = remove_fintech_duplicates(
|
|
2647
2662
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2648
2663
|
)
|
|
@@ -2764,6 +2779,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2764
2779
|
search_keys=combined_search_keys,
|
|
2765
2780
|
unnest_search_keys=unnest_search_keys,
|
|
2766
2781
|
model_task_type=self.model_task_type,
|
|
2782
|
+
cv_type=self.cv,
|
|
2767
2783
|
date_format=self.date_format,
|
|
2768
2784
|
random_state=self.random_state,
|
|
2769
2785
|
rest_client=self.rest_client,
|
|
@@ -2920,6 +2936,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2920
2936
|
if not self.warning_counter.has_warnings():
|
|
2921
2937
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2922
2938
|
|
|
2939
|
+
def __should_add_date_column(self):
|
|
2940
|
+
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
|
2941
|
+
|
|
2923
2942
|
def __adjust_cv(self, df: pd.DataFrame):
|
|
2924
2943
|
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2925
2944
|
# Check Multivariate time series
|
|
@@ -3165,7 +3184,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3165
3184
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
3166
3185
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3167
3186
|
else:
|
|
3168
|
-
date_column =
|
|
3187
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
3169
3188
|
sort_columns = [date_column] if date_column is not None else []
|
|
3170
3189
|
|
|
3171
3190
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3357,6 +3376,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3357
3376
|
if t == SearchKey.POSTAL_CODE:
|
|
3358
3377
|
return col
|
|
3359
3378
|
|
|
3379
|
+
@staticmethod
|
|
3380
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3381
|
+
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3382
|
+
|
|
3360
3383
|
def _explode_multiple_search_keys(
|
|
3361
3384
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
3362
3385
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
@@ -3365,7 +3388,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3365
3388
|
for key_name, key_type in search_keys.items():
|
|
3366
3389
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3367
3390
|
search_key_names_by_type = {
|
|
3368
|
-
key_type: key_names
|
|
3391
|
+
key_type: key_names
|
|
3392
|
+
for key_type, key_names in search_key_names_by_type.items()
|
|
3393
|
+
if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
|
|
3369
3394
|
}
|
|
3370
3395
|
if len(search_key_names_by_type) == 0:
|
|
3371
3396
|
return df, {}
|
|
@@ -3418,9 +3443,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3418
3443
|
]
|
|
3419
3444
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3420
3445
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3421
|
-
sort_exclude_columns.append(
|
|
3446
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3422
3447
|
else:
|
|
3423
|
-
date_column =
|
|
3448
|
+
date_column = self._get_date_column(search_keys)
|
|
3424
3449
|
sort_columns = [date_column] if date_column is not None else []
|
|
3425
3450
|
|
|
3426
3451
|
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
@@ -3856,11 +3881,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3856
3881
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3857
3882
|
raise ValidationError(msg)
|
|
3858
3883
|
|
|
3859
|
-
if SearchKey.CUSTOM_KEY in valid_search_keys.values():
|
|
3860
|
-
custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
|
|
3861
|
-
for key in custom_keys:
|
|
3862
|
-
del valid_search_keys[key]
|
|
3863
|
-
|
|
3864
3884
|
if (
|
|
3865
3885
|
len(valid_search_keys.values()) == 1
|
|
3866
3886
|
and self.country_code is None
|
|
{upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings.properties
@@ -216,7 +216,6 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
|
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
|
|
219
|
-
online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
|
|
220
219
|
|
|
221
220
|
# Validation table
|
|
222
221
|
validation_column_name_header=Column name
|
|
{upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/utils/target_utils.py
@@ -1,15 +1,18 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
import logging
|
|
2
|
-
from typing import Callable, Optional, Union
|
|
3
|
+
from typing import Callable, List, Optional, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
7
8
|
|
|
8
9
|
from upgini.errors import ValidationError
|
|
9
|
-
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
10
|
+
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
11
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
12
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
13
|
|
|
14
|
+
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
15
18
|
if isinstance(y, pd.Series):
|
|
@@ -201,7 +204,10 @@ def balance_undersample(
|
|
|
201
204
|
def balance_undersample_forced(
|
|
202
205
|
df: pd.DataFrame,
|
|
203
206
|
target_column: str,
|
|
207
|
+
id_columns: List[str],
|
|
208
|
+
date_column: str,
|
|
204
209
|
task_type: ModelTaskType,
|
|
210
|
+
cv_type: CVType | None,
|
|
205
211
|
random_state: int,
|
|
206
212
|
sample_size: int = 7000,
|
|
207
213
|
logger: Optional[logging.Logger] = None,
|
|
@@ -233,7 +239,17 @@ def balance_undersample_forced(
|
|
|
233
239
|
|
|
234
240
|
resampled_data = df
|
|
235
241
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
236
|
-
if
|
|
242
|
+
if cv_type is not None and cv_type.is_time_series():
|
|
243
|
+
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
+
resampled_data = balance_undersample_time_series(
|
|
245
|
+
df,
|
|
246
|
+
id_columns=id_columns,
|
|
247
|
+
date_column=date_column,
|
|
248
|
+
sample_size=sample_size,
|
|
249
|
+
random_state=random_state,
|
|
250
|
+
logger=logger,
|
|
251
|
+
)
|
|
252
|
+
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
237
253
|
logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
|
|
238
254
|
resampled_data = df.sample(n=sample_size, random_state=random_state)
|
|
239
255
|
else:
|
|
@@ -264,6 +280,63 @@ def balance_undersample_forced(
|
|
|
264
280
|
return resampled_data
|
|
265
281
|
|
|
266
282
|
|
|
283
|
+
def balance_undersample_time_series(
|
|
284
|
+
df: pd.DataFrame,
|
|
285
|
+
id_columns: List[str],
|
|
286
|
+
date_column: str,
|
|
287
|
+
sample_size: int,
|
|
288
|
+
random_state: int = 42,
|
|
289
|
+
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
290
|
+
prefer_recent_dates: bool = True,
|
|
291
|
+
logger: Optional[logging.Logger] = None,
|
|
292
|
+
):
|
|
293
|
+
def ensure_tuple(x):
|
|
294
|
+
return tuple([x]) if not isinstance(x, tuple) else x
|
|
295
|
+
|
|
296
|
+
random_state = np.random.RandomState(random_state)
|
|
297
|
+
|
|
298
|
+
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
299
|
+
ids_sort = {
|
|
300
|
+
ensure_tuple(k): (
|
|
301
|
+
(v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
|
|
302
|
+
)
|
|
303
|
+
for k, v in ids_sort.items()
|
|
304
|
+
}
|
|
305
|
+
id_counts = df[id_columns].value_counts()
|
|
306
|
+
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
307
|
+
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
308
|
+
id_counts = id_counts[id_counts <= sample_size]
|
|
309
|
+
min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
|
|
310
|
+
|
|
311
|
+
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
312
|
+
if isinstance(sample_index, pd.MultiIndex):
|
|
313
|
+
return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
|
|
314
|
+
else:
|
|
315
|
+
return df[id_columns[0]].isin(sample_index)
|
|
316
|
+
|
|
317
|
+
if len(id_counts) < min_different_ids:
|
|
318
|
+
if logger is not None:
|
|
319
|
+
logger.info(
|
|
320
|
+
f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
|
|
321
|
+
)
|
|
322
|
+
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
323
|
+
ids_to_sample = date_counts.index[:min_different_ids]
|
|
324
|
+
mask = id_mask(ids_to_sample)
|
|
325
|
+
df = df[mask]
|
|
326
|
+
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
327
|
+
sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
|
|
328
|
+
df = df[df[date_column].isin(sample_date_counts.index)]
|
|
329
|
+
else:
|
|
330
|
+
if len(id_columns) > 1:
|
|
331
|
+
id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
|
|
332
|
+
else:
|
|
333
|
+
id_counts.index = [i[0] for i in id_counts.index]
|
|
334
|
+
mask = id_mask(id_counts.index)
|
|
335
|
+
df = df[mask]
|
|
336
|
+
|
|
337
|
+
return df
|
|
338
|
+
|
|
339
|
+
|
|
267
340
|
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
268
341
|
try:
|
|
269
342
|
df = pd.concat([expected, actual])
|
|
upgini-1.2.37/src/upgini/__about__.py
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.37"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.37 → upgini-1.2.38a3769.dev2}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|