upgini 1.2.45__py3-none-any.whl → 1.2.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.45"
1
+ __version__ = "1.2.47"
upgini/dataset.py CHANGED
@@ -646,7 +646,7 @@ class Dataset: # (pd.DataFrame):
646
646
  parquet_file_path = self.prepare_uploading_file(tmp_dir)
647
647
  time.sleep(1) # this is neccesary to avoid requests rate limit restrictions
648
648
  # If previous steps were too fast, time estimation could be calculated incorrectly
649
- time_left = max(time.time() - start_time, 20)
649
+ time_left = max(time.time() - start_time, 20.0)
650
650
  search_progress = SearchProgress(1.0, ProgressStage.CREATING_FIT, time_left)
651
651
  if progress_bar is not None:
652
652
  progress_bar.progress = search_progress.to_progress_bar()
@@ -699,7 +699,7 @@ class Dataset: # (pd.DataFrame):
699
699
  runtime_parameters=runtime_parameters,
700
700
  metrics_calculation=metrics_calculation,
701
701
  )
702
- seconds_left = max(time.time() - start_time, 20)
702
+ seconds_left = max(time.time() - start_time, 20.0)
703
703
  search_progress = SearchProgress(1.0, ProgressStage.CREATING_TRANSFORM, seconds_left)
704
704
  if progress_bar is not None:
705
705
  progress_bar.progress = search_progress.to_progress_bar()
@@ -165,10 +165,6 @@ class FeaturesEnricher(TransformerMixin):
165
165
 
166
166
  shared_datasets: list of str, optional (default=None)
167
167
  List of private shared dataset ids for custom search
168
-
169
- select_features: bool, optional (default=False)
170
- If True, return only selected features both from input and data sources.
171
- Otherwise, return all features from input and only selected features from data sources.
172
168
  """
173
169
 
174
170
  TARGET_NAME = "target"
@@ -235,7 +231,6 @@ class FeaturesEnricher(TransformerMixin):
235
231
  client_visitorid: Optional[str] = None,
236
232
  custom_bundle_config: Optional[str] = None,
237
233
  add_date_if_missing: bool = True,
238
- select_features: bool = False,
239
234
  disable_force_downsampling: bool = False,
240
235
  id_columns: Optional[List[str]] = None,
241
236
  **kwargs,
@@ -273,6 +268,7 @@ class FeaturesEnricher(TransformerMixin):
273
268
  self.eval_set: Optional[List[Tuple]] = None
274
269
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
275
270
  self.imbalanced = False
271
+ self.fit_select_features = False
276
272
  self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
277
273
  dict()
278
274
  )
@@ -297,7 +293,6 @@ class FeaturesEnricher(TransformerMixin):
297
293
  self.dropped_client_feature_names_ = []
298
294
  self.feature_importances_ = []
299
295
  self.search_id = search_id
300
- self.select_features = select_features
301
296
  self.disable_force_downsampling = disable_force_downsampling
302
297
 
303
298
  if search_id:
@@ -405,6 +400,7 @@ class FeaturesEnricher(TransformerMixin):
405
400
  remove_outliers_calc_metrics: Optional[bool] = None,
406
401
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
407
402
  search_id_callback: Optional[Callable[[str], Any]] = None,
403
+ select_features: bool = False,
408
404
  **kwargs,
409
405
  ):
410
406
  """Fit to data.
@@ -440,6 +436,10 @@ class FeaturesEnricher(TransformerMixin):
440
436
 
441
437
  remove_outliers_calc_metrics, optional (default=True)
442
438
  If True then rows with target ouliers will be dropped on metrics calculation
439
+
440
+ select_features: bool, optional (default=False)
441
+ If True, return only selected features both from input and data sources.
442
+ Otherwise, return all features from input and only selected features from data sources.
443
443
  """
444
444
  trace_id = str(uuid.uuid4())
445
445
  start_time = time.time()
@@ -474,6 +474,7 @@ class FeaturesEnricher(TransformerMixin):
474
474
  self.y = y
475
475
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
476
476
  self.dump_input(trace_id, X, y, self.eval_set)
477
+ self.__set_select_features(select_features)
477
478
  self.__inner_fit(
478
479
  trace_id,
479
480
  X,
@@ -523,6 +524,10 @@ class FeaturesEnricher(TransformerMixin):
523
524
  finally:
524
525
  self.logger.info(f"Fit elapsed time: {time.time() - start_time}")
525
526
 
527
+ def __set_select_features(self, select_features: bool):
528
+ self.fit_select_features = select_features
529
+ self.runtime_parameters.properties["select_features"] = select_features
530
+
526
531
  def fit_transform(
527
532
  self,
528
533
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
@@ -538,6 +543,7 @@ class FeaturesEnricher(TransformerMixin):
538
543
  estimator: Optional[Any] = None,
539
544
  remove_outliers_calc_metrics: Optional[bool] = None,
540
545
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
546
+ select_features: bool = False,
541
547
  **kwargs,
542
548
  ) -> pd.DataFrame:
543
549
  """Fit to data, then transform it.
@@ -578,6 +584,10 @@ class FeaturesEnricher(TransformerMixin):
578
584
  remove_outliers_calc_metrics, optional (default=True)
579
585
  If True then rows with target ouliers will be dropped on metrics calculation
580
586
 
587
+ select_features: bool, optional (default=False)
588
+ If True, return only selected features both from input and data sources.
589
+ Otherwise, return all features from input and only selected features from data sources.
590
+
581
591
  Returns
582
592
  -------
583
593
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -613,6 +623,7 @@ class FeaturesEnricher(TransformerMixin):
613
623
  self.X = X
614
624
  self.y = y
615
625
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
626
+ self.__set_select_features(select_features)
616
627
  self.dump_input(trace_id, X, y, self.eval_set)
617
628
 
618
629
  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
@@ -1096,7 +1107,8 @@ class FeaturesEnricher(TransformerMixin):
1096
1107
  ):
1097
1108
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1098
1109
  # np.mean(validated_y), 4
1099
- np.mean(y_sorted), 4
1110
+ np.mean(y_sorted),
1111
+ 4,
1100
1112
  )
1101
1113
  if etalon_metric is not None:
1102
1114
  train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
@@ -1174,7 +1186,8 @@ class FeaturesEnricher(TransformerMixin):
1174
1186
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1175
1187
  # np.mean(validated_eval_set[idx][1]), 4
1176
1188
  # Use actually used for metrics dataset
1177
- np.mean(eval_y_sorted), 4
1189
+ np.mean(eval_y_sorted),
1190
+ 4,
1178
1191
  )
1179
1192
  if etalon_eval_metric is not None:
1180
1193
  eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
@@ -1238,8 +1251,11 @@ class FeaturesEnricher(TransformerMixin):
1238
1251
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1239
1252
 
1240
1253
  def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
1254
+ renaming = self.fit_columns_renaming or {}
1241
1255
  new_shaps = {
1242
- feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1256
+ renaming.get(feature, feature): _round_shap_value(shap)
1257
+ for feature, shap in new_shaps.items()
1258
+ if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1243
1259
  }
1244
1260
  self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1245
1261
 
@@ -1458,7 +1474,8 @@ class FeaturesEnricher(TransformerMixin):
1458
1474
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1459
1475
  excluded = set()
1460
1476
  for sk in excluding_search_keys:
1461
- if columns_renaming.get(sk) in search_keys_for_metrics:
1477
+ renamed_sk = columns_renaming.get(sk)
1478
+ if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
1462
1479
  excluded.add(sk)
1463
1480
  excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
1464
1481
 
@@ -1468,7 +1485,7 @@ class FeaturesEnricher(TransformerMixin):
1468
1485
  c
1469
1486
  for c in X_sampled.columns.to_list()
1470
1487
  if (
1471
- not self.select_features
1488
+ not self.fit_select_features
1472
1489
  or c in self.feature_names_
1473
1490
  or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1474
1491
  )
@@ -3315,8 +3332,8 @@ if response.status_code == 200:
3315
3332
  f"Client ip: {self.client_ip}\n"
3316
3333
  f"Client visitorId: {self.client_visitorid}\n"
3317
3334
  f"Add date if missing: {self.add_date_if_missing}\n"
3318
- f"Select features: {self.select_features}\n"
3319
3335
  f"Disable force downsampling: {self.disable_force_downsampling}\n"
3336
+ f"Id columns: {self.id_columns}\n"
3320
3337
  )
3321
3338
 
3322
3339
  def sample(df):
@@ -3703,7 +3720,7 @@ if response.status_code == 200:
3703
3720
  is_client_feature = feature_meta.name in x_columns
3704
3721
 
3705
3722
  if feature_meta.shap_value == 0.0:
3706
- if self.select_features:
3723
+ if self.fit_select_features:
3707
3724
  self.dropped_client_feature_names_.append(feature_meta.name)
3708
3725
  continue
3709
3726
 
@@ -3712,7 +3729,7 @@ if response.status_code == 200:
3712
3729
  feature_meta.name in self.fit_generated_features
3713
3730
  or feature_meta.name == COUNTRY
3714
3731
  # In select_features mode we select also from etalon features and need to show them
3715
- or (not self.select_features and is_client_feature)
3732
+ or (not self.fit_select_features and is_client_feature)
3716
3733
  ):
3717
3734
  continue
3718
3735
 
upgini/http.py CHANGED
@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
16
16
  from urllib.parse import urljoin
17
17
 
18
18
  import jwt
19
- import pandas as pd
19
+ # import pandas as pd
20
20
  import requests
21
21
  from pydantic import BaseModel
22
22
  from pythonjsonlogger import jsonlogger
@@ -422,6 +422,16 @@ class _RestClient:
422
422
  lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
423
423
  )
424
424
 
425
+ @staticmethod
426
+ def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
427
+ hash_func = getattr(hashlib, algorithm)()
428
+
429
+ with open(filepath, "rb") as f:
430
+ for chunk in iter(lambda: f.read(chunk_size), b""):
431
+ hash_func.update(chunk)
432
+
433
+ return hash_func.hexdigest()
434
+
425
435
  def initial_search_v2(
426
436
  self,
427
437
  trace_id: str,
@@ -442,9 +452,10 @@ class _RestClient:
442
452
  digest = md5_hash.hexdigest()
443
453
  metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
444
454
 
445
- digest_sha256 = hashlib.sha256(
446
- pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
447
- ).hexdigest()
455
+ # digest_sha256 = hashlib.sha256(
456
+ # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
457
+ # ).hexdigest()
458
+ digest_sha256 = self.compute_file_digest(file_path)
448
459
  metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
449
460
 
450
461
  with open(file_path, "rb") as file:
@@ -530,9 +541,10 @@ class _RestClient:
530
541
  digest = md5_hash.hexdigest()
531
542
  metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
532
543
 
533
- digest_sha256 = hashlib.sha256(
534
- pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
535
- ).hexdigest()
544
+ # digest_sha256 = hashlib.sha256(
545
+ # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
546
+ # ).hexdigest()
547
+ digest_sha256 = self.compute_file_digest(file_path)
536
548
  metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
537
549
 
538
550
  with open(file_path, "rb") as file:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.45
3
+ Version: 1.2.47
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,10 +1,10 @@
1
- upgini/__about__.py,sha256=TdZZy40X_55kxtHKjwbGqPrDr5qQe_uUz_Fvup_EuNI,23
1
+ upgini/__about__.py,sha256=o2NRe9gScRz1I1oB_R5MjkQ4w7BrDovQP2Z_Mq2c6bo,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=WfCg7x_HREGwbFNlF-CTSk-KZzkGYlT7PtZdiJfuzLM,33452
4
+ upgini/dataset.py,sha256=QC3jncWS3wHe4CY7pWWDMO_3HKxGbi0EyPHXMdBtoQM,33456
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=QQaK682uFjrkvDOt-ub7UFMAEy6SVjPQubb6dY_7moE,199109
7
- upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
6
+ upgini/features_enricher.py,sha256=NWYNZtSgAR05zOZp_Wq1ltVGThCttTbVN_TP2RaWFSI,200008
7
+ upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
10
10
  upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.45.dist-info/METADATA,sha256=nv_TxkRQegnVyCPp8Wj5iyVbA08rQ2FWTKdoPTm20Gg,49055
63
- upgini-1.2.45.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.45.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.45.dist-info/RECORD,,
62
+ upgini-1.2.47.dist-info/METADATA,sha256=4pKaboM3TxupdS6iw1Uh_IW9Dw0X88LnDh1pGjsc3fs,49055
63
+ upgini-1.2.47.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.47.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.47.dist-info/RECORD,,