upgini 1.2.38a3769.dev7__py3-none-any.whl → 1.2.39a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +2 -24
- upgini/features_enricher.py +36 -55
- upgini/metadata.py +0 -3
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/target_utils.py +3 -78
- {upgini-1.2.38a3769.dev7.dist-info → upgini-1.2.39a1.dist-info}/METADATA +1 -1
- {upgini-1.2.38a3769.dev7.dist-info → upgini-1.2.39a1.dist-info}/RECORD +10 -10
- {upgini-1.2.38a3769.dev7.dist-info → upgini-1.2.39a1.dist-info}/WHEEL +1 -1
- {upgini-1.2.38a3769.dev7.dist-info → upgini-1.2.39a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.38a3769.dev7"
|
|
1
|
+
__version__ = "1.2.39a1"
|
upgini/dataset.py
CHANGED
|
@@ -22,7 +22,6 @@ from upgini.metadata import (
|
|
|
22
22
|
EVAL_SET_INDEX,
|
|
23
23
|
SYSTEM_RECORD_ID,
|
|
24
24
|
TARGET,
|
|
25
|
-
CVType,
|
|
26
25
|
DataType,
|
|
27
26
|
FeaturesFilter,
|
|
28
27
|
FileColumnMeaningType,
|
|
@@ -33,12 +32,11 @@ from upgini.metadata import (
|
|
|
33
32
|
NumericInterval,
|
|
34
33
|
RuntimeParameters,
|
|
35
34
|
SearchCustomization,
|
|
36
|
-
SearchKey,
|
|
37
35
|
)
|
|
38
36
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
39
37
|
from upgini.search_task import SearchTask
|
|
40
38
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
41
|
-
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
|
|
39
|
+
from upgini.utils.target_utils import balance_undersample, balance_undersample_forced
|
|
42
40
|
|
|
43
41
|
try:
|
|
44
42
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -76,8 +74,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
76
74
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
77
75
|
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
78
76
|
model_task_type: Optional[ModelTaskType] = None,
|
|
79
|
-
cv_type: Optional[CVType] = None,
|
|
80
|
-
id_columns: Optional[List[str]] = None,
|
|
81
77
|
random_state: Optional[int] = None,
|
|
82
78
|
rest_client: Optional[_RestClient] = None,
|
|
83
79
|
logger: Optional[logging.Logger] = None,
|
|
@@ -108,7 +104,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
108
104
|
|
|
109
105
|
self.dataset_name = dataset_name
|
|
110
106
|
self.task_type = model_task_type
|
|
111
|
-
self.cv_type = cv_type
|
|
112
107
|
self.description = description
|
|
113
108
|
self.meaning_types = meaning_types
|
|
114
109
|
self.search_keys = search_keys
|
|
@@ -121,7 +116,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
121
116
|
self.random_state = random_state
|
|
122
117
|
self.columns_renaming: Dict[str, str] = {}
|
|
123
118
|
self.imbalanced: bool = False
|
|
124
|
-
self.id_columns = id_columns
|
|
125
119
|
if logger is not None:
|
|
126
120
|
self.logger = logger
|
|
127
121
|
else:
|
|
@@ -231,8 +225,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
231
225
|
df=self.data,
|
|
232
226
|
target_column=target_column,
|
|
233
227
|
task_type=self.task_type,
|
|
234
|
-
cv_type=self.cv_type,
|
|
235
|
-
id_columns=self.id_columns,
|
|
236
228
|
random_state=self.random_state,
|
|
237
229
|
sample_size=self.FORCE_SAMPLE_SIZE,
|
|
238
230
|
logger=self.logger,
|
|
@@ -305,21 +297,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
305
297
|
f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
|
|
306
298
|
f"and will be downsampled to {sample_rows}"
|
|
307
299
|
)
|
|
308
|
-
|
|
309
|
-
resampled_data = balance_undersample_time_series(
|
|
310
|
-
df=self.data,
|
|
311
|
-
id_columns=self.id_columns,
|
|
312
|
-
date_column=next(
|
|
313
|
-
k
|
|
314
|
-
for k, v in self.meaning_types.items()
|
|
315
|
-
if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
|
|
316
|
-
),
|
|
317
|
-
sample_size=sample_rows,
|
|
318
|
-
random_state=self.random_state,
|
|
319
|
-
logger=self.logger,
|
|
320
|
-
)
|
|
321
|
-
else:
|
|
322
|
-
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
300
|
+
resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
|
|
323
301
|
self.data = resampled_data
|
|
324
302
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
325
303
|
|
upgini/features_enricher.py
CHANGED
|
@@ -237,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
237
|
add_date_if_missing: bool = True,
|
|
238
238
|
select_features: bool = False,
|
|
239
239
|
disable_force_downsampling: bool = False,
|
|
240
|
-
id_columns: Optional[List[str]] = None,
|
|
241
240
|
**kwargs,
|
|
242
241
|
):
|
|
243
242
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -278,12 +277,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
278
277
|
)
|
|
279
278
|
|
|
280
279
|
validate_version(self.logger, self.__log_warning)
|
|
281
|
-
|
|
282
280
|
self.search_keys = search_keys or {}
|
|
283
|
-
self.id_columns = id_columns
|
|
284
281
|
self.country_code = country_code
|
|
285
282
|
self.__validate_search_keys(search_keys, search_id)
|
|
286
|
-
|
|
287
283
|
self.model_task_type = model_task_type
|
|
288
284
|
self.endpoint = endpoint
|
|
289
285
|
self._search_task: Optional[SearchTask] = None
|
|
@@ -932,7 +928,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
932
928
|
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
933
929
|
estimator, validated_X, self.search_keys
|
|
934
930
|
)
|
|
935
|
-
search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
|
|
936
931
|
|
|
937
932
|
prepared_data = self._prepare_data_for_metrics(
|
|
938
933
|
trace_id=trace_id,
|
|
@@ -988,7 +983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
988
983
|
with Spinner():
|
|
989
984
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
990
985
|
|
|
991
|
-
has_date = self._get_date_column(search_keys) is not None
|
|
986
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
992
987
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
993
988
|
|
|
994
989
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1190,7 +1185,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1190
1185
|
)
|
|
1191
1186
|
|
|
1192
1187
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1193
|
-
date_column = self._get_date_column(search_keys)
|
|
1188
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1194
1189
|
if (
|
|
1195
1190
|
uplift_col in metrics_df.columns
|
|
1196
1191
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1359,7 +1354,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1359
1354
|
groups = None
|
|
1360
1355
|
|
|
1361
1356
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1362
|
-
date_column = self._get_date_column(search_keys)
|
|
1357
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1363
1358
|
date_series = X[date_column] if date_column is not None else None
|
|
1364
1359
|
_cv, groups = CVConfig(
|
|
1365
1360
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1672,7 +1667,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1672
1667
|
search_keys = self.search_keys.copy()
|
|
1673
1668
|
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1674
1669
|
|
|
1675
|
-
date_column = self._get_date_column(search_keys)
|
|
1670
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1676
1671
|
generated_features = []
|
|
1677
1672
|
if date_column is not None:
|
|
1678
1673
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
@@ -1746,7 +1741,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1746
1741
|
search_keys = self.fit_search_keys
|
|
1747
1742
|
|
|
1748
1743
|
rows_to_drop = None
|
|
1749
|
-
has_date = self._get_date_column(search_keys) is not None
|
|
1744
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1750
1745
|
self.model_task_type = self.model_task_type or define_task(
|
|
1751
1746
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1752
1747
|
)
|
|
@@ -1858,10 +1853,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1858
1853
|
df = balance_undersample_forced(
|
|
1859
1854
|
df=df,
|
|
1860
1855
|
target_column=TARGET,
|
|
1861
|
-
id_columns=self.id_columns,
|
|
1862
|
-
date_column=self._get_date_column(self.search_keys),
|
|
1863
1856
|
task_type=self.model_task_type,
|
|
1864
|
-
cv_type=self.cv,
|
|
1865
1857
|
random_state=self.random_state,
|
|
1866
1858
|
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1867
1859
|
logger=self.logger,
|
|
@@ -2003,7 +1995,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2003
1995
|
trace_id = trace_id or uuid.uuid4()
|
|
2004
1996
|
return search_task.get_progress(trace_id)
|
|
2005
1997
|
|
|
2006
|
-
def get_transactional_transform_api(self):
|
|
1998
|
+
def get_transactional_transform_api(self, only_online_sources=False):
|
|
2007
1999
|
if self.api_key is None:
|
|
2008
2000
|
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
2009
2001
|
if self._search_task is None:
|
|
@@ -2061,7 +2053,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2061
2053
|
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
2062
2054
|
-H 'Authorization: {self.api_key}' \\
|
|
2063
2055
|
-H 'Content-Type: application/json' \\
|
|
2064
|
-
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
2056
|
+
-d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
|
|
2065
2057
|
return api_example
|
|
2066
2058
|
|
|
2067
2059
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -2105,13 +2097,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2105
2097
|
return None, {c: c for c in X.columns}, []
|
|
2106
2098
|
|
|
2107
2099
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2108
|
-
online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
|
|
2100
|
+
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
|
2109
2101
|
if len(online_api_features) > 0:
|
|
2110
2102
|
self.logger.warning(
|
|
2111
2103
|
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2112
2104
|
)
|
|
2113
|
-
|
|
2114
|
-
|
|
2105
|
+
msg = self.bundle.get("online_api_features_transform").format(online_api_features)
|
|
2106
|
+
self.logger.warning(msg)
|
|
2107
|
+
print(msg)
|
|
2108
|
+
print(self.get_transactional_transform_api(only_online_sources=True))
|
|
2115
2109
|
|
|
2116
2110
|
if not metrics_calculation:
|
|
2117
2111
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -2161,7 +2155,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2161
2155
|
df = self.__add_country_code(df, search_keys)
|
|
2162
2156
|
|
|
2163
2157
|
generated_features = []
|
|
2164
|
-
date_column = self._get_date_column(search_keys)
|
|
2158
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2165
2159
|
if date_column is not None:
|
|
2166
2160
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2167
2161
|
df = converter.convert(df, keep_time=True)
|
|
@@ -2169,7 +2163,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2169
2163
|
generated_features.extend(converter.generated_features)
|
|
2170
2164
|
else:
|
|
2171
2165
|
self.logger.info("Input dataset hasn't date column")
|
|
2172
|
-
if self.__should_add_date_column():
|
|
2166
|
+
if self.add_date_if_missing:
|
|
2173
2167
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2174
2168
|
|
|
2175
2169
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -2300,7 +2294,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2300
2294
|
meaning_types=meaning_types,
|
|
2301
2295
|
search_keys=combined_search_keys,
|
|
2302
2296
|
unnest_search_keys=unnest_search_keys,
|
|
2303
|
-
id_columns=self.__get_renamed_id_columns(),
|
|
2304
2297
|
date_format=self.date_format,
|
|
2305
2298
|
rest_client=self.rest_client,
|
|
2306
2299
|
logger=self.logger,
|
|
@@ -2453,14 +2446,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2453
2446
|
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2454
2447
|
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2455
2448
|
for multi_key in multi_keys:
|
|
2456
|
-
if multi_key not in [
|
|
2457
|
-
SearchKey.PHONE,
|
|
2458
|
-
SearchKey.IP,
|
|
2459
|
-
SearchKey.POSTAL_CODE,
|
|
2460
|
-
SearchKey.EMAIL,
|
|
2461
|
-
SearchKey.HEM,
|
|
2462
|
-
SearchKey.CUSTOM_KEY,
|
|
2463
|
-
]:
|
|
2449
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2464
2450
|
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2465
2451
|
self.logger.warning(msg)
|
|
2466
2452
|
raise ValidationError(msg)
|
|
@@ -2624,7 +2610,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2624
2610
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2625
2611
|
else:
|
|
2626
2612
|
self.logger.info("Input dataset hasn't date column")
|
|
2627
|
-
if self.__should_add_date_column():
|
|
2613
|
+
if self.add_date_if_missing:
|
|
2628
2614
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2629
2615
|
|
|
2630
2616
|
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
@@ -2657,12 +2643,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2657
2643
|
|
|
2658
2644
|
self.__adjust_cv(df)
|
|
2659
2645
|
|
|
2660
|
-
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2661
|
-
id_columns = self.__get_renamed_id_columns()
|
|
2662
|
-
if id_columns:
|
|
2663
|
-
self.fit_search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
|
|
2664
|
-
self.runtime_parameters.properties["id_columns"] = ",".join(id_columns)
|
|
2665
|
-
|
|
2666
2646
|
df, fintech_warnings = remove_fintech_duplicates(
|
|
2667
2647
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2668
2648
|
)
|
|
@@ -2692,6 +2672,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2692
2672
|
self.fit_search_keys,
|
|
2693
2673
|
self.fit_columns_renaming,
|
|
2694
2674
|
list(unnest_search_keys.keys()),
|
|
2675
|
+
self.bundle,
|
|
2695
2676
|
self.logger,
|
|
2696
2677
|
)
|
|
2697
2678
|
df = converter.convert(df)
|
|
@@ -2784,8 +2765,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2784
2765
|
search_keys=combined_search_keys,
|
|
2785
2766
|
unnest_search_keys=unnest_search_keys,
|
|
2786
2767
|
model_task_type=self.model_task_type,
|
|
2787
|
-
cv_type=self.cv,
|
|
2788
|
-
id_columns=self.__get_renamed_id_columns(),
|
|
2789
2768
|
date_format=self.date_format,
|
|
2790
2769
|
random_state=self.random_state,
|
|
2791
2770
|
rest_client=self.rest_client,
|
|
@@ -2942,13 +2921,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2942
2921
|
if not self.warning_counter.has_warnings():
|
|
2943
2922
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2944
2923
|
|
|
2945
|
-
def __should_add_date_column(self):
|
|
2946
|
-
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
|
2947
|
-
|
|
2948
|
-
def __get_renamed_id_columns(self):
|
|
2949
|
-
reverse_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
|
|
2950
|
-
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
|
2951
|
-
|
|
2952
2924
|
def __adjust_cv(self, df: pd.DataFrame):
|
|
2953
2925
|
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2954
2926
|
# Check Multivariate time series
|
|
@@ -3194,7 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3194
3166
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
3195
3167
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3196
3168
|
else:
|
|
3197
|
-
date_column = self._get_date_column(search_keys)
|
|
3169
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3198
3170
|
sort_columns = [date_column] if date_column is not None else []
|
|
3199
3171
|
|
|
3200
3172
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3257,6 +3229,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3257
3229
|
f"Generate features: {self.generate_features}\n"
|
|
3258
3230
|
f"Round embeddings: {self.round_embeddings}\n"
|
|
3259
3231
|
f"Detect missing search keys: {self.detect_missing_search_keys}\n"
|
|
3232
|
+
f"Exclude columns: {self.exclude_columns}\n"
|
|
3260
3233
|
f"Exclude features sources: {exclude_features_sources}\n"
|
|
3261
3234
|
f"Calculate metrics: {calculate_metrics}\n"
|
|
3262
3235
|
f"Scoring: {scoring}\n"
|
|
@@ -3264,6 +3237,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3264
3237
|
f"Remove target outliers: {remove_outliers_calc_metrics}\n"
|
|
3265
3238
|
f"Exclude columns: {self.exclude_columns}\n"
|
|
3266
3239
|
f"Search id: {self.search_id}\n"
|
|
3240
|
+
f"Custom loss: {self.loss}\n"
|
|
3241
|
+
f"Logs enabled: {self.logs_enabled}\n"
|
|
3242
|
+
f"Raise validation error: {self.raise_validation_error}\n"
|
|
3243
|
+
f"Baseline score column: {self.baseline_score_column}\n"
|
|
3244
|
+
f"Client ip: {self.client_ip}\n"
|
|
3245
|
+
f"Client visitorId: {self.client_visitorid}\n"
|
|
3246
|
+
f"Add date if missing: {self.add_date_if_missing}\n"
|
|
3247
|
+
f"Select features: {self.select_features}\n"
|
|
3248
|
+
f"Disable force downsampling: {self.disable_force_downsampling}\n"
|
|
3267
3249
|
)
|
|
3268
3250
|
|
|
3269
3251
|
def sample(df):
|
|
@@ -3386,10 +3368,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3386
3368
|
if t == SearchKey.POSTAL_CODE:
|
|
3387
3369
|
return col
|
|
3388
3370
|
|
|
3389
|
-
@staticmethod
|
|
3390
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3391
|
-
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3392
|
-
|
|
3393
3371
|
def _explode_multiple_search_keys(
|
|
3394
3372
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
3395
3373
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
@@ -3398,9 +3376,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3398
3376
|
for key_name, key_type in search_keys.items():
|
|
3399
3377
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3400
3378
|
search_key_names_by_type = {
|
|
3401
|
-
key_type: key_names
|
|
3402
|
-
for key_type, key_names in search_key_names_by_type.items()
|
|
3403
|
-
if len(key_names) > 1 and key_type != SearchKey.CUSTOM_KEY
|
|
3379
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
3404
3380
|
}
|
|
3405
3381
|
if len(search_key_names_by_type) == 0:
|
|
3406
3382
|
return df, {}
|
|
@@ -3453,9 +3429,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3453
3429
|
]
|
|
3454
3430
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3455
3431
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3456
|
-
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3432
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3457
3433
|
else:
|
|
3458
|
-
date_column = self._get_date_column(search_keys)
|
|
3434
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3459
3435
|
sort_columns = [date_column] if date_column is not None else []
|
|
3460
3436
|
|
|
3461
3437
|
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
@@ -3891,6 +3867,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3891
3867
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3892
3868
|
raise ValidationError(msg)
|
|
3893
3869
|
|
|
3870
|
+
if SearchKey.CUSTOM_KEY in valid_search_keys.values():
|
|
3871
|
+
custom_keys = [column for column, key in valid_search_keys.items() if key == SearchKey.CUSTOM_KEY]
|
|
3872
|
+
for key in custom_keys:
|
|
3873
|
+
del valid_search_keys[key]
|
|
3874
|
+
|
|
3894
3875
|
if (
|
|
3895
3876
|
len(valid_search_keys.values()) == 1
|
|
3896
3877
|
and self.country_code is None
|
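upgini/metadata.py
CHANGED
[hunks for this file (+0 -3) are missing from this extract]
upgini/resource_bundle/strings.properties
CHANGED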
|
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
|
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
|
|
219
|
+
online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
|
|
219
220
|
|
|
220
221
|
# Validation table
|
|
221
222
|
validation_column_name_header=Column name
|
upgini/utils/target_utils.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
|
1
|
-
import itertools
|
|
2
1
|
import logging
|
|
3
|
-
from typing import Callable, List, Optional, Union
|
|
2
|
+
from typing import Callable, Optional, Union
|
|
4
3
|
|
|
5
4
|
import numpy as np
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
8
7
|
|
|
9
8
|
from upgini.errors import ValidationError
|
|
10
|
-
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
9
|
+
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
11
10
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
12
11
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
13
12
|
|
|
14
|
-
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
|
-
|
|
16
13
|
|
|
17
14
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
18
15
|
if isinstance(y, pd.Series):
|
|
@@ -204,10 +201,7 @@ def balance_undersample(
|
|
|
204
201
|
def balance_undersample_forced(
|
|
205
202
|
df: pd.DataFrame,
|
|
206
203
|
target_column: str,
|
|
207
|
-
id_columns: List[str],
|
|
208
|
-
date_column: str,
|
|
209
204
|
task_type: ModelTaskType,
|
|
210
|
-
cv_type: CVType | None,
|
|
211
205
|
random_state: int,
|
|
212
206
|
sample_size: int = 7000,
|
|
213
207
|
logger: Optional[logging.Logger] = None,
|
|
@@ -239,17 +233,7 @@ def balance_undersample_forced(
|
|
|
239
233
|
|
|
240
234
|
resampled_data = df
|
|
241
235
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
242
|
-
if cv_type is not None and cv_type.is_time_series():
|
|
243
|
-
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
-
resampled_data = balance_undersample_time_series(
|
|
245
|
-
df,
|
|
246
|
-
id_columns=id_columns,
|
|
247
|
-
date_column=date_column,
|
|
248
|
-
sample_size=sample_size,
|
|
249
|
-
random_state=random_state,
|
|
250
|
-
logger=logger,
|
|
251
|
-
)
|
|
252
|
-
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
236
|
+
if task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION, ModelTaskType.TIMESERIES]:
|
|
253
237
|
logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
|
|
254
238
|
resampled_data = df.sample(n=sample_size, random_state=random_state)
|
|
255
239
|
else:
|
|
@@ -280,65 +264,6 @@ def balance_undersample_forced(
|
|
|
280
264
|
return resampled_data
|
|
281
265
|
|
|
282
266
|
|
|
283
|
-
def balance_undersample_time_series(
|
|
284
|
-
df: pd.DataFrame,
|
|
285
|
-
id_columns: List[str],
|
|
286
|
-
date_column: str,
|
|
287
|
-
sample_size: int,
|
|
288
|
-
random_state: int = 42,
|
|
289
|
-
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
290
|
-
prefer_recent_dates: bool = True,
|
|
291
|
-
logger: Optional[logging.Logger] = None,
|
|
292
|
-
):
|
|
293
|
-
def ensure_tuple(x):
|
|
294
|
-
return tuple([x]) if not isinstance(x, tuple) else x
|
|
295
|
-
|
|
296
|
-
random_state = np.random.RandomState(random_state)
|
|
297
|
-
|
|
298
|
-
if not id_columns:
|
|
299
|
-
id_columns = [date_column]
|
|
300
|
-
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
301
|
-
ids_sort = {
|
|
302
|
-
ensure_tuple(k): (
|
|
303
|
-
(v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
|
|
304
|
-
)
|
|
305
|
-
for k, v in ids_sort.items()
|
|
306
|
-
}
|
|
307
|
-
id_counts = df[id_columns].value_counts()
|
|
308
|
-
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
309
|
-
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
310
|
-
id_counts = id_counts[id_counts <= sample_size]
|
|
311
|
-
min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
|
|
312
|
-
|
|
313
|
-
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
314
|
-
if isinstance(sample_index, pd.MultiIndex):
|
|
315
|
-
return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
|
|
316
|
-
else:
|
|
317
|
-
return df[id_columns[0]].isin(sample_index)
|
|
318
|
-
|
|
319
|
-
if len(id_counts) < min_different_ids:
|
|
320
|
-
if logger is not None:
|
|
321
|
-
logger.info(
|
|
322
|
-
f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
|
|
323
|
-
)
|
|
324
|
-
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
325
|
-
ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
|
|
326
|
-
mask = id_mask(ids_to_sample)
|
|
327
|
-
df = df[mask]
|
|
328
|
-
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
329
|
-
sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
|
|
330
|
-
df = df[df[date_column].isin(sample_date_counts.index)]
|
|
331
|
-
else:
|
|
332
|
-
if len(id_columns) > 1:
|
|
333
|
-
id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
|
|
334
|
-
else:
|
|
335
|
-
id_counts.index = [i[0] for i in id_counts.index]
|
|
336
|
-
mask = id_mask(id_counts.index)
|
|
337
|
-
df = df[mask]
|
|
338
|
-
|
|
339
|
-
return df
|
|
340
|
-
|
|
341
|
-
|
|
342
267
|
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
343
268
|
try:
|
|
344
269
|
df = pd.concat([expected, actual])
|
|
{upgini-1.2.38a3769.dev7.dist-info → upgini-1.2.39a1.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=_wWeNiK5_JiwKIlVeEktsOM2zu0cB2l3qXursaGZU9U,25
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256
|
|
4
|
+
upgini/dataset.py,sha256=rUBE7_G7CLaaHAviFEyVPqjVSsX1DaLmi1dGFQR-eEo,32279
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=h17dmuAucpbkZs6E2T59-R9m-p8gW9bkXLY7NzvObKA,196002
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256
|
|
9
|
+
upgini/metadata.py,sha256=sB5uU-fdz_dA6g-PO6A8FzwIfDbkcFOewcpNs2xZzoY,11943
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=uQWmbcd9TJh-xE0QpmHpHYKw-20utvXeHwFA-U_iTLw,27302
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -56,10 +56,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
58
58
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
59
|
-
upgini/utils/target_utils.py,sha256=
|
|
59
|
+
upgini/utils/target_utils.py,sha256=Ed5IXkPjV9AfAZQAwCYksAmKaPGQliplvDYS_yeWdfk,11330
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.39a1.dist-info/METADATA,sha256=qvNcejSCxKiITZbFqsGiaewkRsolxpy6OiePNwzqf90,48596
|
|
63
|
+
upgini-1.2.39a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.39a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.39a1.dist-info/RECORD,,
|
|
File without changes
|