upgini 1.1.244a18__tar.gz → 1.1.244a20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.244a18/src/upgini.egg-info → upgini-1.1.244a20}/PKG-INFO +1 -1
- {upgini-1.1.244a18 → upgini-1.1.244a20}/setup.py +1 -1
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/features_enricher.py +3 -2
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/metrics.py +15 -18
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/deduplicate_utils.py +2 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_blocked_time_series.py +13 -6
- {upgini-1.1.244a18 → upgini-1.1.244a20}/LICENSE +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/README.md +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/pyproject.toml +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/setup.cfg +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/dataset.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/errors.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/http.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/metadata.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/search_task.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/spinner.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_country_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_email_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_metrics.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_widget.py +0 -0
|
@@ -1855,8 +1855,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1855
1855
|
)
|
|
1856
1856
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1857
1857
|
|
|
1858
|
-
df = clean_full_duplicates(df, self.logger, silent=silent_mode)
|
|
1859
|
-
|
|
1860
1858
|
df = df.reset_index(drop=True)
|
|
1861
1859
|
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1862
1860
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
@@ -1865,6 +1863,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1865
1863
|
|
|
1866
1864
|
df_without_features = df.drop(columns=non_keys_columns)
|
|
1867
1865
|
|
|
1866
|
+
df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
|
|
1867
|
+
|
|
1868
1868
|
del df
|
|
1869
1869
|
gc.collect()
|
|
1870
1870
|
|
|
@@ -2092,6 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2092
2092
|
msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2093
2093
|
print(msg)
|
|
2094
2094
|
self.logger.warning(msg)
|
|
2095
|
+
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2095
2096
|
|
|
2096
2097
|
validate_scoring_argument(scoring)
|
|
2097
2098
|
|
|
@@ -414,6 +414,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
414
414
|
self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
|
|
415
415
|
embedding_features = []
|
|
416
416
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
417
|
+
self.logger.info(
|
|
418
|
+
f"Embedding features count more than 3, so group them into one vector for CatBoost: {self.emb_features}"
|
|
419
|
+
)
|
|
417
420
|
X, embedding_features = self.group_embeddings(X)
|
|
418
421
|
params["embedding_features"] = embedding_features
|
|
419
422
|
else:
|
|
@@ -421,15 +424,13 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
421
424
|
|
|
422
425
|
# Find text features from passed in generate_features
|
|
423
426
|
if self.text_features is not None:
|
|
424
|
-
self.
|
|
427
|
+
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
428
|
+
self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
|
|
429
|
+
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
425
430
|
params["text_features"] = self.text_features
|
|
426
431
|
|
|
427
432
|
# Find rest categorical features
|
|
428
|
-
self.cat_features = _get_cat_features(X)
|
|
429
|
-
if self.text_features is not None:
|
|
430
|
-
self.cat_features = [
|
|
431
|
-
f for f in self.cat_features if f not in self.text_features and f not in embedding_features
|
|
432
|
-
]
|
|
433
|
+
self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
|
|
433
434
|
X = fill_na_cat_features(X, self.cat_features)
|
|
434
435
|
unique_cat_features = []
|
|
435
436
|
for name in self.cat_features:
|
|
@@ -456,6 +457,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
456
457
|
|
|
457
458
|
del self.estimator._init_params["cat_features"]
|
|
458
459
|
|
|
460
|
+
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
459
461
|
params["cat_features"] = self.cat_features
|
|
460
462
|
|
|
461
463
|
return X, y, groups, params
|
|
@@ -473,13 +475,11 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
473
475
|
X, y, params = super()._prepare_to_calculate(X, y)
|
|
474
476
|
if self.text_features:
|
|
475
477
|
params["text_features"] = self.text_features
|
|
476
|
-
# if self.emb_groups:
|
|
477
478
|
if self.emb_features:
|
|
478
479
|
X, emb_columns = self.group_embeddings(X)
|
|
479
480
|
params["embedding_features"] = emb_columns
|
|
480
481
|
if self.cat_features:
|
|
481
482
|
X = fill_na_cat_features(X, self.cat_features)
|
|
482
|
-
if self.cat_features:
|
|
483
483
|
params["cat_features"] = self.cat_features
|
|
484
484
|
|
|
485
485
|
return X, y, params
|
|
@@ -633,8 +633,13 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
633
633
|
return scoring, metric_name, multiplier
|
|
634
634
|
|
|
635
635
|
|
|
636
|
-
def _get_cat_features(
|
|
637
|
-
|
|
636
|
+
def _get_cat_features(
|
|
637
|
+
X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
|
|
638
|
+
) -> List[str]:
|
|
639
|
+
text_features = text_features or []
|
|
640
|
+
emb_features = emb_features or []
|
|
641
|
+
exclude_features = text_features + emb_features
|
|
642
|
+
return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
|
|
638
643
|
|
|
639
644
|
|
|
640
645
|
def _get_add_params(input_params, add_params):
|
|
@@ -731,11 +736,3 @@ def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFr
|
|
|
731
736
|
na_filter = df[c].str.lower().isin(NA_VALUES)
|
|
732
737
|
df.loc[na_filter, c] = NA_REPLACEMENT
|
|
733
738
|
return df
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
|
|
737
|
-
many_values_features_count = 0
|
|
738
|
-
for f in _get_cat_features(X):
|
|
739
|
-
if X[f].astype("string").nunique() > 100:
|
|
740
|
-
many_values_features_count += 1
|
|
741
|
-
return many_values_features_count >= 2
|
|
@@ -103,6 +103,8 @@ def clean_full_duplicates(
|
|
|
103
103
|
unique_columns = df.columns.tolist()
|
|
104
104
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
105
105
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
106
|
+
if "sort_id" in unique_columns:
|
|
107
|
+
unique_columns.remove("sort_id")
|
|
106
108
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
107
109
|
df = df.drop_duplicates(subset=unique_columns)
|
|
108
110
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
@@ -33,13 +33,20 @@ def test_bts_split_logic():
|
|
|
33
33
|
def test_bts_metrics():
|
|
34
34
|
X, y, cv, _, model = _prepare_data()
|
|
35
35
|
cv_result = set(cross_val_score(model, X, y, cv=cv, scoring="roc_auc"))
|
|
36
|
-
assert cv_result == {
|
|
37
|
-
0.
|
|
38
|
-
0.
|
|
39
|
-
0.
|
|
40
|
-
0.
|
|
41
|
-
0.
|
|
36
|
+
assert {round(r, 3) for r in cv_result} == {
|
|
37
|
+
0.456,
|
|
38
|
+
0.477,
|
|
39
|
+
0.481,
|
|
40
|
+
0.489,
|
|
41
|
+
0.515,
|
|
42
42
|
}
|
|
43
|
+
# assert cv_result == {
|
|
44
|
+
# 0.4559664254320743,
|
|
45
|
+
# 0.4767320313326982,
|
|
46
|
+
# 0.4811855209016638,
|
|
47
|
+
# 0.48947924927306374,
|
|
48
|
+
# 0.5150543675843606,
|
|
49
|
+
# }
|
|
43
50
|
|
|
44
51
|
|
|
45
52
|
def test_bts_exceptions():
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|