upgini 1.2.13a4__py3-none-any.whl → 1.2.14a1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.13a4"
1
+ __version__ = "1.2.14a1"
upgini/autofe/binary.py CHANGED
@@ -140,13 +140,9 @@ class Distance(PandasOperand):
140
140
  has_symmetry_importance: bool = True
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
- # Handle None values by replacing them with 0 in the dot product and norm calculations
144
- left = left.apply(lambda x: np.array(x) if x is not None else np.zeros_like(right[0]))
145
- right = right.apply(lambda x: np.array(x) if x is not None else np.zeros_like(left[0]))
146
-
147
143
  return pd.Series(
148
144
  1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
149
- )
145
+ ).astype(np.float64)
150
146
 
151
147
  # row-wise dot product, handling None values
152
148
  def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
upgini/dataset.py CHANGED
@@ -253,6 +253,7 @@ class Dataset: # (pd.DataFrame):
253
253
  min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
254
254
  min_class_threshold = min_class_percent * count
255
255
 
256
+ # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
256
257
  if min_class_count < min_class_threshold:
257
258
  self.imbalanced = True
258
259
  self.data = balance_undersample(
@@ -150,30 +150,34 @@ def balance_undersample(
150
150
  # fill up to min_sample_threshold by majority class
151
151
  minority_class = df[df[target_column] == min_class_value]
152
152
  majority_class = df[df[target_column] != min_class_value]
153
- sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
153
+ # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
154
+ sample_size = min(
155
+ max_class_count,
156
+ binary_bootstrap_loops * (min_class_count + max(min_sample_threshold - 2 * min_class_count, 0)),
157
+ )
154
158
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
155
159
  resampled_data = df[
156
160
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
157
161
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
158
162
  ]
159
163
 
160
- elif max_class_count > min_class_count * binary_bootstrap_loops:
161
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
162
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
163
- )
164
- logger.warning(msg)
165
- print(msg)
166
- if warning_counter:
167
- warning_counter.increment()
168
-
169
- sampler = RandomUnderSampler(
170
- sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
171
- )
172
- X = df[SYSTEM_RECORD_ID]
173
- X = X.to_frame(SYSTEM_RECORD_ID)
174
- new_x, _ = sampler.fit_resample(X, target) # type: ignore
175
-
176
- resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
164
+ # elif max_class_count > min_class_count * binary_bootstrap_loops:
165
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
166
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
167
+ # )
168
+ # logger.warning(msg)
169
+ # print(msg)
170
+ # if warning_counter:
171
+ # warning_counter.increment()
172
+
173
+ # sampler = RandomUnderSampler(
174
+ # sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
175
+ # )
176
+ # X = df[SYSTEM_RECORD_ID]
177
+ # X = X.to_frame(SYSTEM_RECORD_ID)
178
+ # new_x, _ = sampler.fit_resample(X, target) # type: ignore
179
+
180
+ # resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
177
181
 
178
182
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
179
183
  return resampled_data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.13a4
3
+ Version: 1.2.14a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,7 +1,7 @@
1
- upgini/__about__.py,sha256=FCz2XUQlXzrSJIZwpE7MVdBagpn7lwlAw754ujNNr2Q,25
1
+ upgini/__about__.py,sha256=HLgDtt9GUzL7hBuQQkMyI0Uz-5Ms9FchFLQelf9FVWU,25
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
4
+ upgini/dataset.py,sha256=OEcmfHYUDbV5idrx4zMVI4yZ6bFpoiyP7EvBhPXlgeA,31165
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
6
  upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
18
- upgini/autofe/binary.py,sha256=i2Y0uAOXVORt-RgnkO0gM7jZz2l5j3jqYz_yBOT2gxk,7927
18
+ upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
19
  upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
20
  upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
54
54
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
57
+ upgini/utils/target_utils.py,sha256=SbO9CmvWIwR2pNzPZAMVDRI3cb0O-3NFggLs5cfGPxY,8071
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.13a4.dist-info/METADATA,sha256=WqeEXF0ava0eEP1wD566T8AIgMX4V74H404kcc7k0DY,48579
61
- upgini-1.2.13a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.13a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.13a4.dist-info/RECORD,,
60
+ upgini-1.2.14a1.dist-info/METADATA,sha256=jkOw-XYOr5RrO16cLakpeXDP-fl2Zs4xx2hAVgNYUow,48579
61
+ upgini-1.2.14a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.14a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.14a1.dist-info/RECORD,,