upgini 1.2.38a3769.dev8__tar.gz → 1.2.39a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/PKG-INFO +1 -1
- upgini-1.2.39a1/src/upgini/__about__.py +1 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/dataset.py +2 -24
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/features_enricher.py +37 -60
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/metadata.py +0 -3
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/target_utils.py +3 -78
- upgini-1.2.38a3769.dev8/src/upgini/__about__.py +0 -1
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/.gitignore +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/LICENSE +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/README.md +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/pyproject.toml +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/errors.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/http.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/metrics.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.39a1"
|
|
@@ -22,7 +22,6 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
-
CVType,
|
|
26
25
|
DataType,
|
|
27
26
|
FeaturesFilter,
|
|
28
27
|
FileColumnMeaningType,
|
|
@@ -33,12 +32,11 @@ from upgini.metadata import (
|
|
|
33
32
|
NumericInterval,
|
|
34
33
|
RuntimeParameters,
|
|
35
34
|
SearchCustomization,
|
|
36
|
-
SearchKey,
|
|
37
35
|
)
|
|
38
36
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
39
37
|
from upgini.search_task import SearchTask
|
|
40
38
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
41
|
-
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
|
|
39
|
+
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
|
|
42
40
|
|
|
43
41
|
try:
|
|
44
42
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -76,8 +74,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
76
74
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
77
75
|
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
78
76
|
model_task_type: Optional[ModelTaskType] = None,
|
|
79
|
-
cv_type: Optional[CVType] = None,
|
|
80
|
-
id_columns: Optional[List[str]] = None,
|
|
81
77
|
random_state: Optional[int] = None,
|
|
82
78
|
rest_client: Optional[_RestClient] = None,
|
|
83
79
|
logger: Optional[logging.Logger] = None,
|
|
@@ -108,7 +104,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
108
104
|
|
|
109
105
|
self.dataset_name = dataset_name
|
|
110
106
|
self.task_type = model_task_type
|
|
111
|
-
self.cv_type = cv_type
|
|
112
107
|
self.description = description
|
|
113
108
|
self.meaning_types = meaning_types
|
|
114
109
|
self.search_keys = search_keys
|
|
@@ -121,7 +116,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
121
116
|
self.random_state = random_state
|
|
122
117
|
self.columns_renaming: Dict[str, str] = {}
|
|
123
118
|
self.imbalanced: bool = False
|
|
124
|
-
self.id_columns = id_columns
|
|
125
119
|
if logger is not None:
|
|
126
120
|
self.logger = logger
|
|
127
121
|
else:
|
|
@@ -231,8 +225,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
231
225
|
df=self.data,
|
|
232
226
|
target_column=target_column,
|
|
233
227
|
task_type=self.task_type,
|
|
234
|
-
cv_type=self.cv_type,
|
|
235
|
-
id_columns=self.id_columns,
|
|
236
228
|
random_state=self.random_state,
|
|
237
229
|
sample_size=self.FORCE_SAMPLE_SIZE,
|
|
238
230
|
logger=self.logger,
|
|
@@ -305,21 +297,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
305
297
|
f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
|
|
306
298
|
f"and will be downsampled to {sample_rows}"
|
|
307
299
|
)
|
|
308
|
-
|
|
309
|
-
resampled_data = balance_undersample_time_series(
|
|
310
|
-
df=self.data,
|
|
311
|
-
id_columns=self.id_columns,
|
|
312
|
-
date_column=next(
|
|
313
|
-
k
|
|
314
|
-
for k, v in self.meaning_types.items()
|
|
315
|
-
if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
|
|
316
|
-
),
|
|
317
|
-
sample_size=sample_rows,
|
|
318
|
-
random_state=self.random_state,
|
|
319
|
-
logger=self.logger,
|
|
320
|
-
)
|
|
321
|
-
else:
|
|
322
|
-
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
300
|
+
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
323
301
|
self.data = resampled_data
|
|
324
302
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
325
303
|
|
|
@@ -237,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
237
|
add_date_if_missing: bool = True,
|
|
238
238
|
select_features: bool = False,
|
|
239
239
|
disable_force_downsampling: bool = False,
|
|
240
|
-
id_columns: Optional[List[str]] = None,
|
|
241
240
|
**kwargs,
|
|
242
241
|
):
|
|
243
242
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -278,12 +277,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
278
277
|
)
|
|
279
278
|
|
|
280
279
|
validate_version(self.logger, self.__log_warning)
|
|
281
|
-
|
|
282
280
|
self.search_keys = search_keys or {}
|
|
283
|
-
self.id_columns = id_columns
|
|
284
281
|
self.country_code = country_code
|
|
285
282
|
self.__validate_search_keys(search_keys, search_id)
|
|
286
|
-
|
|
287
283
|
self.model_task_type = model_task_type
|
|
288
284
|
self.endpoint = endpoint
|
|
289
285
|
self._search_task: Optional[SearchTask] = None
|
|
@@ -932,9 +928,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
932
928
|
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
933
929
|
estimator, validated_X, self.search_keys
|
|
934
930
|
)
|
|
935
|
-
search_keys_for_metrics.extend(
|
|
936
|
-
[c for c in self.__get_renamed_id_columns() or [] if c not in search_keys_for_metrics]
|
|
937
|
-
)
|
|
938
931
|
|
|
939
932
|
prepared_data = self._prepare_data_for_metrics(
|
|
940
933
|
trace_id=trace_id,
|
|
@@ -990,7 +983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
990
983
|
with Spinner():
|
|
991
984
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
992
985
|
|
|
993
|
-
has_date = self._get_date_column(search_keys) is not None
|
|
986
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
994
987
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
995
988
|
|
|
996
989
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1192,7 +1185,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1192
1185
|
)
|
|
1193
1186
|
|
|
1194
1187
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1195
|
-
date_column = self._get_date_column(search_keys)
|
|
1188
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1196
1189
|
if (
|
|
1197
1190
|
uplift_col in metrics_df.columns
|
|
1198
1191
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1361,7 +1354,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1361
1354
|
groups = None
|
|
1362
1355
|
|
|
1363
1356
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1364
|
-
date_column = self._get_date_column(search_keys)
|
|
1357
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1365
1358
|
date_series = X[date_column] if date_column is not None else None
|
|
1366
1359
|
_cv, groups = CVConfig(
|
|
1367
1360
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1450,11 +1443,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1450
1443
|
|
|
1451
1444
|
excluding_search_keys = list(search_keys.keys())
|
|
1452
1445
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1453
|
-
excluded = set()
|
|
1454
1446
|
for sk in excluding_search_keys:
|
|
1455
1447
|
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1456
|
-
excluded.add(sk)
|
1457
|
-
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
|
|
1448
|
+
excluding_search_keys.remove(sk)
|
|
1458
1449
|
|
|
1459
1450
|
client_features = [
|
|
1460
1451
|
c
|
|
@@ -1676,7 +1667,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1676
1667
|
search_keys = self.search_keys.copy()
|
|
1677
1668
|
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1678
1669
|
|
|
1679
|
-
date_column = self._get_date_column(search_keys)
|
|
1670
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1680
1671
|
generated_features = []
|
|
1681
1672
|
if date_column is not None:
|
|
1682
1673
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
@@ -1750,7 +1741,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1750
1741
|
search_keys = self.fit_search_keys
|
|
1751
1742
|
|
|
1752
1743
|
rows_to_drop = None
|
|
1753
|
-
has_date = self._get_date_column(search_keys) is not None
|
|
1744
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1754
1745
|
self.model_task_type = self.model_task_type or define_task(
|
|
1755
1746
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1756
1747
|
)
|
|
@@ -1862,10 +1853,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1862
1853
|
df = balance_undersample_forced(
|
|
1863
1854
|
df=df,
|
|
1864
1855
|
target_column=TARGET,
|
|
1865
|
-
id_columns=self.id_columns,
|
|
1866
|
-
date_column=self._get_date_column(self.search_keys),
|
|
1867
1856
|
task_type=self.model_task_type,
|
|
1868
|
-
cv_type=self.cv,
|
|
1869
1857
|
random_state=self.random_state,
|
|
1870
1858
|
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1871
1859
|
logger=self.logger,
|
|
@@ -2007,7 +1995,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2007
1995
|
trace_id = trace_id or uuid.uuid4()
|
|
2008
1996
|
return search_task.get_progress(trace_id)
|
|
2009
1997
|
|
|
2010
|
-
def get_transactional_transform_api(self):
|
|
1998
|
+
def get_transactional_transform_api(self, only_online_sources=False):
|
|
2011
1999
|
if self.api_key is None:
|
|
2012
2000
|
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
2013
2001
|
if self._search_task is None:
|
|
@@ -2065,7 +2053,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2065
2053
|
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
2066
2054
|
-H 'Authorization: {self.api_key}' \\
|
|
2067
2055
|
-H 'Content-Type: application/json' \\
|
|
2068
|
-
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
2056
|
+
-d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
|
|
2069
2057
|
return api_example
|
|
2070
2058
|
|
|
2071
2059
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -2109,13 +2097,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2109
2097
|
return None, {c: c for c in X.columns}, []
|
|
2110
2098
|
|
|
2111
2099
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2112
|
-
online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
|
|
2100
|
+
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
|
2113
2101
|
if len(online_api_features) > 0:
|
|
2114
2102
|
self.logger.warning(
|
|
2115
2103
|
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2116
2104
|
)
|
|
2117
|
-
|
|
2118
|
-
|
|
2105
|
+
msg = self.bundle.get("online_api_features_transform").format(online_api_features)
|
|
2106
|
+
self.logger.warning(msg)
|
|
2107
|
+
print(msg)
|
|
2108
|
+
print(self.get_transactional_transform_api(only_online_sources=True))
|
|
2119
2109
|
|
|
2120
2110
|
if not metrics_calculation:
|
|
2121
2111
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -2165,7 +2155,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2165
2155
|
df = self.__add_country_code(df, search_keys)
|
|
2166
2156
|
|
|
2167
2157
|
generated_features = []
|
|
2168
|
-
date_column = self._get_date_column(search_keys)
|
|
2158
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2169
2159
|
if date_column is not None:
|
|
2170
2160
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2171
2161
|
df = converter.convert(df, keep_time=True)
|
|
@@ -2173,7 +2163,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2173
2163
|
generated_features.extend(converter.generated_features)
|
|
2174
2164
|
else:
|
|
2175
2165
|
self.logger.info("Input dataset hasn't date column")
|
|
2176
|
-
if self.__should_add_date_column():
|
|
2166
|
+
if self.add_date_if_missing:
|
|
2177
2167
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2178
2168
|
|
|
2179
2169
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -2304,7 +2294,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2304
2294
|
meaning_types=meaning_types,
|
|
2305
2295
|
search_keys=combined_search_keys,
|
|
2306
2296
|
unnest_search_keys=unnest_search_keys,
|
|
2307
|
-
id_columns=self.__get_renamed_id_columns(),
|
|
2308
2297
|
date_format=self.date_format,
|
|
2309
2298
|
rest_client=self.rest_client,
|
|
2310
2299
|
logger=self.logger,
|
|
@@ -2457,14 +2446,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2457
2446
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2458
2447
|
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2459
2448
|
for multi_key in multi_keys:
|
|
2460
|
-
if multi_key not in [
|
|
2461
|
-
SearchKey.PHONE,
|
|
2462
|
-
SearchKey.IP,
|
|
2463
|
-
SearchKey.POSTAL_CODE,
|
|
2464
|
-
SearchKey.EMAIL,
|
|
2465
|
-
SearchKey.HEM,
|
|
2466
|
-
SearchKey.CUSTOM_KEY,
|
|
2467
|
-
]:
|
|
2449
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2468
2450
|
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2469
2451
|
self.logger.warning(msg)
|
|
2470
2452
|
raise ValidationError(msg)
|
|
@@ -2628,7 +2610,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2628
2610
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2629
2611
|
else:
|
|
2630
2612
|
self.logger.info("Input dataset hasn't date column")
|
|
2631
|
-
if self.__should_add_date_column():
|
|
2613
|
+
if self.add_date_if_missing:
|
|
2632
2614
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2633
2615
|
|
|
2634
2616
|
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
@@ -2661,12 +2643,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2661
2643
|
|
|
2662
2644
|
self.__adjust_cv(df)
|
|
2663
2645
|
|
|
2664
|
-
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2665
|
-
id_columns = self.__get_renamed_id_columns()
|
|
2666
|
-
if id_columns:
|
|
2667
|
-
self.fit_search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
|
|
2668
|
-
self.runtime_parameters.properties["id_columns"] = ",".join(id_columns)
|
|
2669
|
-
|
|
2670
2646
|
df, fintech_warnings = remove_fintech_duplicates(
|
|
2671
2647
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2672
2648
|
)
|
|
@@ -2696,6 +2672,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2696
2672
|
self.fit_search_keys,
|
|
2697
2673
|
self.fit_columns_renaming,
|
|
2698
2674
|
list(unnest_search_keys.keys()),
|
|
2675
|
+
self.bundle,
|
|
2699
2676
|
self.logger,
|
|
2700
2677
|
)
|
|
2701
2678
|
df = converter.convert(df)
|
|
@@ -2788,8 +2765,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2788
2765
|
search_keys=combined_search_keys,
|
|
2789
2766
|
unnest_search_keys=unnest_search_keys,
|
|
2790
2767
|
model_task_type=self.model_task_type,
|
|
2791
|
-
cv_type=self.cv,
|
|
2792
|
-
id_columns=self.__get_renamed_id_columns(),
|
|
2793
2768
|
date_format=self.date_format,
|
|
2794
2769
|
random_state=self.random_state,
|
|
2795
2770
|
rest_client=self.rest_client,
|
|
@@ -2946,13 +2921,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2946
2921
|
if not self.warning_counter.has_warnings():
|
|
2947
2922
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2948
2923
|
|
|
2949
|
-
def __should_add_date_column(self):
|
|
2950
|
-
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
|
2951
|
-
|
|
2952
|
-
def __get_renamed_id_columns(self):
|
|
2953
|
-
reverse_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
|
|
2954
|
-
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
|
2955
|
-
|
|
2956
2924
|
def __adjust_cv(self, df: pd.DataFrame):
|
|
2957
2925
|
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2958
2926
|
# Check Multivariate time series
|
|
@@ -3198,7 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3198
3166
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
3199
3167
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3200
3168
|
else:
|
|
3201
|
-
date_column =
|
|
3169
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3202
3170
|
sort_columns = [date_column] if date_column is not None else []
|
|
3203
3171
|
|
|
3204
3172
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3261,6 +3229,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3261
3229
|
f"Generate features: {self.generate_features}\n"
|
|
3262
3230
|
f"Round embeddings: {self.round_embeddings}\n"
|
|
3263
3231
|
f"Detect missing search keys: {self.detect_missing_search_keys}\n"
|
|
3232
|
+
f"Exclude columns: {self.exclude_columns}\n"
|
|
3264
3233
|
f"Exclude features sources: {exclude_features_sources}\n"
|
|
3265
3234
|
f"Calculate metrics: {calculate_metrics}\n"
|
|
3266
3235
|
f"Scoring: {scoring}\n"
|
|
@@ -3268,6 +3237,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3268
3237
|
f"Remove target outliers: {remove_outliers_calc_metrics}\n"
|
|
3269
3238
|
f"Exclude columns: {self.exclude_columns}\n"
|
|
3270
3239
|
f"Search id: {self.search_id}\n"
|
|
3240
|
+
f"Custom loss: {self.loss}\n"
|
|
3241
|
+
f"Logs enabled: {self.logs_enabled}\n"
|
|
3242
|
+
f"Raise validation error: {self.raise_validation_error}\n"
|
|
3243
|
+
f"Baseline score column: {self.baseline_score_column}\n"
|
|
3244
|
+
f"Client ip: {self.client_ip}\n"
|
|
3245
|
+
f"Client visitorId: {self.client_visitorid}\n"
|
|
3246
|
+
f"Add date if missing: {self.add_date_if_missing}\n"
|
|
3247
|
+
f"Select features: {self.select_features}\n"
|
|
3248
|
+
f"Disable force downsampling: {self.disable_force_downsampling}\n"
|
|
3271
3249
|
)
|
|
3272
3250
|
|
|
3273
3251
|
def sample(df):
|
|
@@ -3390,10 +3368,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3390
3368
|
if t == SearchKey.POSTAL_CODE:
|
|
3391
3369
|
return col
|
|
3392
3370
|
|
|
3393
|
-
@staticmethod
|
|
3394
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3395
|
-
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3396
|
-
|
|
3397
3371
|
def _explode_multiple_search_keys(
|
|
3398
3372
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
3399
3373
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
@@ -3402,9 +3376,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3402
3376
|
for key_name, key_type in search_keys.items():
|
|
3403
3377
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3404
3378
|
search_key_names_by_type = {
|
|
3405
|
-
key_type: key_names
|
|
3406
|
-
for key_type, key_names in search_key_names_by_type.items()
|
|
3407
|
-
if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
|
|
3379
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
3408
3380
|
}
|
|
3409
3381
|
if len(search_key_names_by_type) == 0:
|
|
3410
3382
|
return df, {}
|
|
@@ -3457,9 +3429,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3457
3429
|
]
|
|
3458
3430
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3459
3431
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3460
|
-
sort_exclude_columns.append(
|
|
3432
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3461
3433
|
else:
|
|
3462
|
-
date_column =
|
|
3434
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3463
3435
|
sort_columns = [date_column] if date_column is not None else []
|
|
3464
3436
|
|
|
3465
3437
|
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
@@ -3895,6 +3867,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3895
3867
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3896
3868
|
raise ValidationError(msg)
|
|
3897
3869
|
|
|
3870
|
+
if SearchKey.CUSTOM_KEY in valid_search_keys.values():
|
|
3871
|
+
custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
|
|
3872
|
+
for key in custom_keys:
|
|
3873
|
+
del valid_search_keys[key]
|
|
3874
|
+
|
|
3898
3875
|
if (
|
|
3899
3876
|
len(valid_search_keys.values()) == 1
|
|
3900
3877
|
and self.country_code is None
|
|
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
|
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
|
|
219
|
+
online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
|
|
219
220
|
|
|
220
221
|
# Validation table
|
|
221
222
|
validation_column_name_header=Column name
|
|
@@ -1,18 +1,15 @@
|
|
|
1
|
-
import itertools
|
|
2
1
|
import logging
|
|
3
|
-
from typing import Callable, List, Optional, Union
|
|
2
|
+
from typing import Callable, Optional, Union
|
|
4
3
|
|
|
5
4
|
import numpy as np
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
8
7
|
|
|
9
8
|
from upgini.errors import ValidationError
|
|
10
|
-
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
9
|
+
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
11
10
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
12
11
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
13
12
|
|
|
14
|
-
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
|
-
|
|
16
13
|
|
|
17
14
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
18
15
|
if isinstance(y, pd.Series):
|
|
@@ -204,10 +201,7 @@ def balance_undersample(
|
|
|
204
201
|
def balance_undersample_forced(
|
|
205
202
|
df: pd.DataFrame,
|
|
206
203
|
target_column: str,
|
|
207
|
-
id_columns: List[str],
|
|
208
|
-
date_column: str,
|
|
209
204
|
task_type: ModelTaskType,
|
|
210
|
-
cv_type: CVType | None,
|
|
211
205
|
random_state: int,
|
|
212
206
|
sample_size: int = 7000,
|
|
213
207
|
logger: Optional[logging.Logger] = None,
|
|
@@ -239,17 +233,7 @@ def balance_undersample_forced(
|
|
|
239
233
|
|
|
240
234
|
resampled_data = df
|
|
241
235
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
242
|
-
if
|
|
243
|
-
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
-
resampled_data = balance_undersample_time_series(
|
|
245
|
-
df,
|
|
246
|
-
id_columns=id_columns,
|
|
247
|
-
date_column=date_column,
|
|
248
|
-
sample_size=sample_size,
|
|
249
|
-
random_state=random_state,
|
|
250
|
-
logger=logger,
|
|
251
|
-
)
|
|
252
|
-
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
236
|
+
if task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION, ModelTaskType.TIMESERIES]:
|
|
253
237
|
logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
|
|
254
238
|
resampled_data = df.sample(n=sample_size, random_state=random_state)
|
|
255
239
|
else:
|
|
@@ -280,65 +264,6 @@ def balance_undersample_forced(
|
|
|
280
264
|
return resampled_data
|
|
281
265
|
|
|
282
266
|
|
|
283
|
-
def balance_undersample_time_series(
|
|
284
|
-
df: pd.DataFrame,
|
|
285
|
-
id_columns: List[str],
|
|
286
|
-
date_column: str,
|
|
287
|
-
sample_size: int,
|
|
288
|
-
random_state: int = 42,
|
|
289
|
-
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
290
|
-
prefer_recent_dates: bool = True,
|
|
291
|
-
logger: Optional[logging.Logger] = None,
|
|
292
|
-
):
|
|
293
|
-
def ensure_tuple(x):
|
|
294
|
-
return tuple([x]) if not isinstance(x, tuple) else x
|
|
295
|
-
|
|
296
|
-
random_state = np.random.RandomState(random_state)
|
|
297
|
-
|
|
298
|
-
if not id_columns:
|
|
299
|
-
id_columns = [date_column]
|
|
300
|
-
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
301
|
-
ids_sort = {
|
|
302
|
-
ensure_tuple(k): (
|
|
303
|
-
(v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
|
|
304
|
-
)
|
|
305
|
-
for k, v in ids_sort.items()
|
|
306
|
-
}
|
|
307
|
-
id_counts = df[id_columns].value_counts()
|
|
308
|
-
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
309
|
-
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
310
|
-
id_counts = id_counts[id_counts <= sample_size]
|
|
311
|
-
min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
|
|
312
|
-
|
|
313
|
-
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
314
|
-
if isinstance(sample_index, pd.MultiIndex):
|
|
315
|
-
return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
|
|
316
|
-
else:
|
|
317
|
-
return df[id_columns[0]].isin(sample_index)
|
|
318
|
-
|
|
319
|
-
if len(id_counts) < min_different_ids:
|
|
320
|
-
if logger is not None:
|
|
321
|
-
logger.info(
|
|
322
|
-
f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
|
|
323
|
-
)
|
|
324
|
-
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
325
|
-
ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
|
|
326
|
-
mask = id_mask(ids_to_sample)
|
|
327
|
-
df = df[mask]
|
|
328
|
-
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
329
|
-
sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
|
|
330
|
-
df = df[df[date_column].isin(sample_date_counts.index)]
|
|
331
|
-
else:
|
|
332
|
-
if len(id_columns) > 1:
|
|
333
|
-
id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
|
|
334
|
-
else:
|
|
335
|
-
id_counts.index = [i[0] for i in id_counts.index]
|
|
336
|
-
mask = id_mask(id_counts.index)
|
|
337
|
-
df = df[mask]
|
|
338
|
-
|
|
339
|
-
return df
|
|
340
|
-
|
|
341
|
-
|
|
342
267
|
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
343
268
|
try:
|
|
344
269
|
df = pd.concat([expected, actual])
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.38a3769.dev8"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev8 → upgini-1.2.39a1}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|