upgini 1.1.258__py3-none-any.whl → 1.1.260a1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/dataset.py CHANGED
@@ -223,11 +223,11 @@ class Dataset: # (pd.DataFrame):
223
223
  self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
224
224
 
225
225
  def __convert_bools(self):
226
- """Convert bool columns True -> 1, False -> 0"""
226
+ """Convert bool columns to string"""
227
227
  # self.logger.info("Converting bool to int")
228
228
  for col in self.data.columns:
229
229
  if is_bool(self.data[col]):
230
- self.data[col] = self.data[col].astype("Int64")
230
+ self.data[col] = self.data[col].astype("str")
231
231
 
232
232
  def __convert_float16(self):
233
233
  """Convert float16 to float"""
@@ -1681,9 +1681,12 @@ class FeaturesEnricher(TransformerMixin):
1681
1681
  eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
1682
1682
  else:
1683
1683
  self.logger.info("Transform without eval_set")
1684
- df = self.X.copy()
1684
+ df = validated_X.copy()
1685
1685
 
1686
1686
  df[TARGET] = validated_y
1687
+
1688
+ df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1689
+
1687
1690
  num_samples = _num_samples(df)
1688
1691
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1689
1692
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
@@ -2884,19 +2887,20 @@ class FeaturesEnricher(TransformerMixin):
2884
2887
  sort_columns = [date_column] if date_column is not None else []
2885
2888
 
2886
2889
  other_search_keys = sorted(
2887
- [
2888
- sk
2889
- for sk, key_type in search_keys.items()
2890
- if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2891
- and sk in df.columns
2892
- and df[sk].nunique() > 1 # don't use constant keys for hash
2893
- ]
2890
+ [c for c in df.columns if c not in sort_columns and df[c].nunique() > 1]
2891
+ # [
2892
+ # sk
2893
+ # for sk, key_type in search_keys.items()
2894
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2895
+ # and sk in df.columns
2896
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
2897
+ # ]
2894
2898
  )
2895
2899
 
2896
2900
  search_keys_hash = "search_keys_hash"
2897
2901
  if len(other_search_keys) > 0:
2898
2902
  sort_columns.append(search_keys_hash)
2899
- df[search_keys_hash] = pd.util.hash_pandas_object(df[sorted(other_search_keys)], index=False)
2903
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[other_search_keys], index=False)
2900
2904
 
2901
2905
  df = df.sort_values(by=sort_columns)
2902
2906
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.258
3
+ Version: 1.1.260a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -1,8 +1,8 @@
1
1
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
- upgini/dataset.py,sha256=JSkscuhL0P6-ae6aaxnwvHTqlb9urBe8YrA4DPTdcIw,48163
3
+ upgini/dataset.py,sha256=ywBwf93d0IH39ZGfmNDlAwe1ILQtt1WzJ87WfIOMI2g,48149
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=-uHhey6NMPMQIW16ctEqfWFh421_aaVUX85JeudIZQE,172113
5
+ upgini/features_enricher.py,sha256=RD2EHGkWK30K82ELQmTMzGfcc5Fa1eoXvNNRR4MAssQ,172311
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
@@ -55,8 +55,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
55
55
  upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
56
56
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
57
57
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
58
- upgini-1.1.258.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
- upgini-1.1.258.dist-info/METADATA,sha256=K0skkdAz4yv8GS1IJKq-qktHBm5S82o7qND2I_Wmb9g,48156
60
- upgini-1.1.258.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
61
- upgini-1.1.258.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
- upgini-1.1.258.dist-info/RECORD,,
58
+ upgini-1.1.260a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
+ upgini-1.1.260a1.dist-info/METADATA,sha256=jZ9q9pcKmPU9mVhYBKywk3Hwd31C26RG_H8T3XoQFIM,48158
60
+ upgini-1.1.260a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
61
+ upgini-1.1.260a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
+ upgini-1.1.260a1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5