upgini 1.1.244a19__py3-none-any.whl → 1.1.244a20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/features_enricher.py CHANGED
@@ -1855,8 +1855,6 @@ class FeaturesEnricher(TransformerMixin):
1855
1855
  )
1856
1856
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1857
1857
 
1858
- df = clean_full_duplicates(df, self.logger, silent=silent_mode)
1859
-
1860
1858
  df = df.reset_index(drop=True)
1861
1859
  system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
1862
1860
  df_with_original_index = df[system_columns_with_original_index].copy()
@@ -1865,6 +1863,8 @@ class FeaturesEnricher(TransformerMixin):
1865
1863
 
1866
1864
  df_without_features = df.drop(columns=non_keys_columns)
1867
1865
 
1866
+ df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
1867
+
1868
1868
  del df
1869
1869
  gc.collect()
1870
1870
 
@@ -2092,6 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
2092
2092
  msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2093
2093
  print(msg)
2094
2094
  self.logger.warning(msg)
2095
+ self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2095
2096
 
2096
2097
  validate_scoring_argument(scoring)
2097
2098
 
upgini/metrics.py CHANGED
@@ -414,6 +414,9 @@ class CatBoostWrapper(EstimatorWrapper):
414
414
  self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
415
415
  embedding_features = []
416
416
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
417
+ self.logger.info(
418
+ f"Embedding features count more than 3, so group them into one vector for CatBoost: {self.emb_features}"
419
+ )
417
420
  X, embedding_features = self.group_embeddings(X)
418
421
  params["embedding_features"] = embedding_features
419
422
  else:
@@ -421,7 +424,9 @@ class CatBoostWrapper(EstimatorWrapper):
421
424
 
422
425
  # Find text features from passed in generate_features
423
426
  if self.text_features is not None:
424
- self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
427
+ self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
428
+ self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
429
+ self.logger.info(f"Rest text features after checks: {self.text_features}")
425
430
  params["text_features"] = self.text_features
426
431
 
427
432
  # Find rest categorical features
@@ -452,6 +457,7 @@ class CatBoostWrapper(EstimatorWrapper):
452
457
 
453
458
  del self.estimator._init_params["cat_features"]
454
459
 
460
+ self.logger.info(f"Selected categorical features: {self.cat_features}")
455
461
  params["cat_features"] = self.cat_features
456
462
 
457
463
  return X, y, groups, params
upgini/utils/deduplicate_utils.py CHANGED
@@ -103,6 +103,8 @@ def clean_full_duplicates(
103
103
  unique_columns = df.columns.tolist()
104
104
  if SYSTEM_RECORD_ID in unique_columns:
105
105
  unique_columns.remove(SYSTEM_RECORD_ID)
106
+ if "sort_id" in unique_columns:
107
+ unique_columns.remove("sort_id")
106
108
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
107
109
  df = df.drop_duplicates(subset=unique_columns)
108
110
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
upgini-1.1.244a20.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.244a19
3
+ Version: 1.1.244a20
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
upgini-1.1.244a20.dist-info/RECORD CHANGED
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=WGpnmpnmfdyB2DAwaj7mkk2s0e-6Z6bg5BWj1lUE2p0,49960
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=-5Ofx0f3OHuwP2dgg4O2xL8zaxCb2F-9E1Nc1XnGdZE,165622
5
+ upgini/features_enricher.py,sha256=xN-j01kEbph__ZycO2knKeDlscWdAf_vLzCpO0ERLFo,165759
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
8
8
  upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
9
- upgini/metrics.py,sha256=b9efALHojYiAm_XHfCznZq7TVCU76ZM_zrwU6LI0yY0,25900
9
+ upgini/metrics.py,sha256=0km-A0TNRqHy8jdf_VqfV5iuwA-nZmlJGhQ4fLd8rKg,26340
10
10
  upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
11
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
12
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -40,7 +40,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
40
40
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
41
41
  upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
42
42
  upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
43
- upgini/utils/deduplicate_utils.py,sha256=UEk160TGAqShDYeofAJJehpmEqMSj-I82o4GqDOSGnk,5811
43
+ upgini/utils/deduplicate_utils.py,sha256=D5zdbnpo0qW23Sy04hRbntaliImiLi76KS7Du1l-ea8,5888
44
44
  upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
45
45
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
46
46
  upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
54
54
  upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
55
55
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
56
56
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
57
- upgini-1.1.244a19.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
58
- upgini-1.1.244a19.dist-info/METADATA,sha256=i0jRNkFom9uR1I0tImg7U_PJFquFj5dq6LhqkAGC-bA,48265
59
- upgini-1.1.244a19.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
60
- upgini-1.1.244a19.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
61
- upgini-1.1.244a19.dist-info/RECORD,,
57
+ upgini-1.1.244a20.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
58
+ upgini-1.1.244a20.dist-info/METADATA,sha256=rpTVYdRU4VsFxnVRtiLyVREU_u3NwSEHQy-guOoaKOM,48265
59
+ upgini-1.1.244a20.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
60
+ upgini-1.1.244a20.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
61
+ upgini-1.1.244a20.dist-info/RECORD,,