upgini 1.1.244a19__py3-none-any.whl → 1.1.244a20__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/features_enricher.py +3 -2
- upgini/metrics.py +7 -1
- upgini/utils/deduplicate_utils.py +2 -0
- {upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/METADATA +1 -1
- {upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/RECORD +8 -8
- {upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/WHEEL +0 -0
- {upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -1855,8 +1855,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1855
1855
|
)
|
|
1856
1856
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1857
1857
|
|
|
1858
|
-
df = clean_full_duplicates(df, self.logger, silent=silent_mode)
|
|
1859
|
-
|
|
1860
1858
|
df = df.reset_index(drop=True)
|
|
1861
1859
|
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1862
1860
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
@@ -1865,6 +1863,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1865
1863
|
|
|
1866
1864
|
df_without_features = df.drop(columns=non_keys_columns)
|
|
1867
1865
|
|
|
1866
|
+
df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
|
|
1867
|
+
|
|
1868
1868
|
del df
|
|
1869
1869
|
gc.collect()
|
|
1870
1870
|
|
|
@@ -2092,6 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2092
2092
|
msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2093
2093
|
print(msg)
|
|
2094
2094
|
self.logger.warning(msg)
|
|
2095
|
+
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2095
2096
|
|
|
2096
2097
|
validate_scoring_argument(scoring)
|
|
2097
2098
|
|
upgini/metrics.py
CHANGED
|
@@ -414,6 +414,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
414
414
|
self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
|
|
415
415
|
embedding_features = []
|
|
416
416
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
417
|
+
self.logger.info(
|
|
418
|
+
f"Embedding features count more than 3, so group them into one vector for CatBoost: {self.emb_features}"
|
|
419
|
+
)
|
|
417
420
|
X, embedding_features = self.group_embeddings(X)
|
|
418
421
|
params["embedding_features"] = embedding_features
|
|
419
422
|
else:
|
|
@@ -421,7 +424,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
421
424
|
|
|
422
425
|
# Find text features from passed in generate_features
|
|
423
426
|
if self.text_features is not None:
|
|
424
|
-
self.
|
|
427
|
+
self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
|
|
428
|
+
self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
|
|
429
|
+
self.logger.info(f"Rest text features after checks: {self.text_features}")
|
|
425
430
|
params["text_features"] = self.text_features
|
|
426
431
|
|
|
427
432
|
# Find rest categorical features
|
|
@@ -452,6 +457,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
452
457
|
|
|
453
458
|
del self.estimator._init_params["cat_features"]
|
|
454
459
|
|
|
460
|
+
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
|
455
461
|
params["cat_features"] = self.cat_features
|
|
456
462
|
|
|
457
463
|
return X, y, groups, params
|
|
upgini/utils/deduplicate_utils.py
CHANGED
|
@@ -103,6 +103,8 @@ def clean_full_duplicates(
|
|
|
103
103
|
unique_columns = df.columns.tolist()
|
|
104
104
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
105
105
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
106
|
+
if "sort_id" in unique_columns:
|
|
107
|
+
unique_columns.remove("sort_id")
|
|
106
108
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
107
109
|
df = df.drop_duplicates(subset=unique_columns)
|
|
108
110
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
{upgini-1.1.244a19.dist-info → upgini-1.1.244a20.dist-info}/RECORD
CHANGED
|
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=WGpnmpnmfdyB2DAwaj7mkk2s0e-6Z6bg5BWj1lUE2p0,49960
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256
|
|
5
|
+
upgini/features_enricher.py,sha256=xN-j01kEbph__ZycO2knKeDlscWdAf_vLzCpO0ERLFo,165759
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
|
|
8
8
|
upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metrics.py,sha256=0km-A0TNRqHy8jdf_VqfV5iuwA-nZmlJGhQ4fLd8rKg,26340
|
|
10
10
|
upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -40,7 +40,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
|
|
|
40
40
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
41
41
|
upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
|
|
42
42
|
upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
|
|
43
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
43
|
+
upgini/utils/deduplicate_utils.py,sha256=D5zdbnpo0qW23Sy04hRbntaliImiLi76KS7Du1l-ea8,5888
|
|
44
44
|
upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
|
|
45
45
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
46
46
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
|
|
|
54
54
|
upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
|
|
55
55
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
56
56
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
57
|
-
upgini-1.1.244a19.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.244a19.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
60
|
-
upgini-1.1.244a19.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
61
|
-
upgini-1.1.244a19.dist-info/RECORD,,
|
|
57
|
+
upgini-1.1.244a20.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
58
|
+
upgini-1.1.244a20.dist-info/METADATA,sha256=rpTVYdRU4VsFxnVRtiLyVREU_u3NwSEHQy-guOoaKOM,48265
|
|
59
|
+
upgini-1.1.244a20.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
60
|
+
upgini-1.1.244a20.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
61
|
+
upgini-1.1.244a20.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|