upgini 1.2.120a1__py3-none-any.whl → 1.2.121a2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.120a1"
+ __version__ = "1.2.121a2"
upgini/data_source/data_source_publisher.py CHANGED
@@ -519,21 +519,24 @@ class DataSourcePublisher:
         description: str = "",
     ):
         if model_type is not None and model_type not in ["ONNX", "CATBOOST"]:
-            raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX")
+            raise ValueError(f"Invalid model type: {model_type}. Available values: ONNX, CATBOOST")
         metadata = {
             "modelName": name,
             "inputNames": input_names,
             "dateColumn": date_column,
             "scoreName": score_name,
             "searchTaskId": search_id,
-            "modelType": model_type or "ONNX",
+            "modelType": model_type or "CATBOOST",
             "description": description,
         }

         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             try:
-                self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
+                result = self._rest_client.upload_autofe_model(file_path, metadata, trace_id)
+                if "ERROR" in result:
+                    raise Exception(result)
+                print(result)
             except Exception:
                 self.logger.exception("Failed to upload autofe model")
                 raise
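
The changed try block above now inspects the server response instead of discarding it, so a failed upload surfaces the server's message. A minimal sketch of that pattern, with a hypothetical stub standing in for _RestClient.upload_autofe_model:

    def upload_autofe_model_stub(file_path: str, metadata: dict, trace_id: str) -> str:
        # Hypothetical stand-in: the real client POSTs the file and returns the server's reply as text
        return f"ERROR: model file {file_path} was rejected"

    result = upload_autofe_model_stub("model.cbm", {"modelType": "CATBOOST"}, "trace-123")
    if "ERROR" in result:
        raise Exception(result)  # propagate the server-side failure, as in the diff above
    print(result)
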
upgini/dataset.py CHANGED
@@ -694,9 +694,7 @@ class Dataset:

     def prepare_uploading_file(self, base_path: str) -> str:
         parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
-        print("Before saving parquet file")
         self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
-        print("After saving parquet file")
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
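
With the debug prints removed, the method reduces to the gzip-compressed parquet write plus a size check. A standalone sketch of the same calls, assuming fastparquet is installed (data and path are hypothetical):

    import pandas as pd
    from pathlib import Path

    data = pd.DataFrame({"feature": [1, 2, 3], "target": [0, 1, 0]})
    parquet_file_path = "/tmp/sample_dataset.parquet"  # hypothetical location
    data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
    print(f"Size of prepared uploading file: {Path(parquet_file_path).stat().st_size}. {len(data)} rows")
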
upgini/features_enricher.py CHANGED
@@ -1028,13 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
            columns_renaming,
            _,
        ) = prepared_data
-
-       # rename baseline_score_column
-       reversed_renaming = {v: k for k, v in columns_renaming.items()}
-       baseline_score_column = self.baseline_score_column
-       if baseline_score_column is not None:
-           baseline_score_column = reversed_renaming[baseline_score_column]
-
+
        gc.collect()

        if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1089,7 +1083,7 @@
                has_time=has_time,
            )
            baseline_cv_result = baseline_estimator.cross_val_predict(
-               fitting_X, y_sorted, baseline_score_column
+               fitting_X, y_sorted, self.baseline_score_column
            )
            baseline_metric = baseline_cv_result.get_display_metric()
            if baseline_metric is None:
@@ -1192,7 +1186,7 @@
                    f"on client features: {eval_X_sorted.columns.to_list()}"
                )
                etalon_eval_results = baseline_estimator.calculate_metric(
-                   eval_X_sorted, eval_y_sorted, baseline_score_column
+                   eval_X_sorted, eval_y_sorted, self.baseline_score_column
                )
                etalon_eval_metric = etalon_eval_results.get_display_metric()
                self.logger.info(
@@ -2502,6 +2496,9 @@ if response.status_code == 200:
    ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
        if self._search_task is None:
            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+       features_meta = self._search_task.get_all_features_metadata_v2()
+       if features_meta is None:
+           raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

        start_time = time.time()
        search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
@@ -2531,7 +2528,6 @@
            self.__display_support_link(msg)
            return None, {}, [], self.search_keys

-       features_meta = self._search_task.get_all_features_metadata_v2()
        online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
        if len(online_api_features) > 0:
            self.logger.warning(
@@ -3382,6 +3378,7 @@ if response.status_code == 200:
        except KeyboardInterrupt as e:
            print(self.bundle.get("search_stopping"))
            self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
+           self._search_task = None
            self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
            print(self.bundle.get("search_stopped"))
            raise e
upgini/http.py CHANGED
@@ -426,26 +426,19 @@ class _RestClient:
    ) -> SearchTaskResponse:
        api_path = self.INITIAL_SEARCH_URI_FMT_V2

-       print("Before getting track metrics")
        track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
-       print("After getting track metrics")

        def open_and_send():
            md5_hash = hashlib.md5()
-           print("Before opening file to calculate hashes")
            with open(file_path, "rb") as file:
                content = file.read()
                md5_hash.update(content)
            digest = md5_hash.hexdigest()
            metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
-           print("After calculating md5")

-           print("Before calculating sha256")
            digest_sha256 = file_hash(file_path)
-           print("After calculating sha256")
            metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

-           print("Before opening file to send")
            with open(file_path, "rb") as file:
                files = {
                    "metadata": (
@@ -473,12 +466,9 @@ class _RestClient:
                )
            additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}

-           print("Before sending request for initial search")
-           response = self._send_post_file_req_v2(
+           return self._send_post_file_req_v2(
                api_path, files, trace_id=trace_id, additional_headers=additional_headers
            )
-           print("After sending request")
-           return response

        response = self._with_unauth_retry(open_and_send)
        return SearchTaskResponse(response)
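
The hashing logic retained in open_and_send computes an MD5 checksum plus a second digest (via upgini's file_hash helper) for the upload metadata. A minimal standalone sketch of equivalent checksum computation with the standard library; hashlib.sha256 is used here on the assumption that file_hash produces a SHA-256 digest:

    import hashlib
    from pathlib import Path

    def checksums(file_path: str) -> tuple[str, str]:
        """Return (md5, sha256) hex digests of a file, mirroring the checksumMD5/digest metadata fields."""
        data = Path(file_path).read_bytes()
        md5 = hashlib.md5(data).hexdigest()
        sha256 = hashlib.sha256(data).hexdigest()  # assumed equivalent of upgini's file_hash helper
        return md5, sha256
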
upgini/resource_bundle/strings.properties CHANGED
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
 # features validation
 empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
 high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
-# one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
+one_hot_encoded_features=One hot encoded features detected: {}

 # Dataset validation
 dataset_too_few_rows=X size should be at least {} rows after validation
upgini/utils/features_validator.py CHANGED
@@ -24,7 +24,7 @@ class FeaturesValidator:
        features_for_generate: Optional[List[str]] = None,
        columns_renaming: Optional[Dict[str, str]] = None,
    ) -> Tuple[List[str], List[str]]:
-       # one_hot_encoded_features = []
+       one_hot_encoded_features = []
        empty_or_constant_features = []
        high_cardinality_features = []
        warnings = []
@@ -36,23 +36,17 @@ class FeaturesValidator:
            value_counts = column.value_counts(dropna=False, normalize=True)
            most_frequent_percent = value_counts.iloc[0]

-           if most_frequent_percent >= 0.99:
+           if len(value_counts) == 1:
                empty_or_constant_features.append(f)
+           elif most_frequent_percent >= 0.99:
+               if self.is_one_hot_encoded(column):
+                   one_hot_encoded_features.append(f)
+               else:
+                   empty_or_constant_features.append(f)

-           # TODO implement one-hot encoding check
-           # if len(value_counts) == 1:
-           # empty_or_constant_features.append(f)
-           # elif most_frequent_percent >= 0.99:
-           # empty_or_constant_features.append(f)
-           # if set(value_counts.index.to_list()) == {0, 1}:
-           # one_hot_encoded_features.append(f)
-           # else:
-           # empty_or_constant_features.append(f)
-           # continue
-
-           # if one_hot_encoded_features:
-           # msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
-           # warnings.append(msg)
+       if one_hot_encoded_features:
+           msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
+           self.logger.info(msg)

        columns_renaming = columns_renaming or {}

@@ -102,3 +96,30 @@
    @staticmethod
    def find_constant_features(df: pd.DataFrame) -> List[str]:
        return [i for i in df if df[i].nunique() <= 1]
+
+   @staticmethod
+   def is_one_hot_encoded(series: pd.Series) -> bool:
+       try:
+           # Column contains only 0 and 1 (as strings or numbers)
+           series = series.astype(float)
+           if set(series.unique()) != {0.0, 1.0}:
+               return False
+
+           series = series.astype(int)
+
+           # Column doesn't contain any NaN, np.NaN, space, null, etc.
+           if not (series.isin([0, 1])).all():
+               return False
+
+           vc = series.value_counts()
+           # Column should contain both 0 and 1
+           if len(vc) != 2:
+               return False
+
+           # Minority class is 1
+           if vc[1] >= vc[0]:
+               return False
+
+           return True
+       except ValueError:
+           return False
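
A minimal standalone sketch of the heuristic added above, useful for sanity-checking which columns it treats as one-hot indicators (the sample data and the helper name looks_one_hot are hypothetical):

    import pandas as pd

    def looks_one_hot(series: pd.Series) -> bool:
        # Mirrors FeaturesValidator.is_one_hot_encoded: only 0/1 values (numeric or string),
        # both classes present, and 1 is the minority class.
        try:
            as_float = series.astype(float)
            if set(as_float.unique()) != {0.0, 1.0}:
                return False
            vc = as_float.astype(int).value_counts()
            return len(vc) == 2 and vc[1] < vc[0]
        except ValueError:
            return False

    print(looks_one_hot(pd.Series([0, 0, 0, 1, 0, 0])))   # True: sparse 0/1 indicator
    print(looks_one_hot(pd.Series(["0", "1", "0", "0"])))  # True: string-encoded 0/1
    print(looks_one_hot(pd.Series([1, 1, 1, 0])))          # False: 1 is the majority class
    print(looks_one_hot(pd.Series([0, 2, 0, 0])))          # False: values other than 0/1
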
upgini/utils/sklearn_ext.py CHANGED
@@ -1301,6 +1301,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features], y_train)

+   # OrdinalEncoder doesn't support progressive encoding with target
    X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

@@ -1314,10 +1315,8 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features], y_train)

-   # Progressive encoding on train (using y)
-   X_train[cat_features] = encoder.transform(X_train[cat_features], y_train).astype(int)
-
-   # Static encoding on validation (no y)
+   # OrdinalEncoder doesn't support progressive encoding with target
+   X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)

    return X_train, y_train, X_test, y_test, [], encoder
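
For reference, a minimal sketch of the pattern the fixed lines now follow, assuming the OrdinalEncoder here is scikit-learn's (whose transform accepts only X, hence no progressive, target-aware encoding; the toy data is hypothetical):

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    cat_features = ["city"]
    X_train = pd.DataFrame({"city": ["a", "b", "a", "c"]})
    X_test = pd.DataFrame({"city": ["b", "d"]})  # "d" is unseen at fit time

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(X_train[cat_features])  # fit takes an optional y but ignores it

    # transform() accepts only X, so train and test are encoded the same (static) way;
    # categories unseen during fit are mapped to -1
    X_train[cat_features] = encoder.transform(X_train[cat_features]).astype(int)
    X_test[cat_features] = encoder.transform(X_test[cat_features]).astype(int)
    print(X_test)  # "d" -> -1
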
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.120a1
+Version: 1.2.121a2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -1,10 +1,10 @@
-upgini/__about__.py,sha256=J4ou6xfTwIgzTXi7mnxG9WD4vn49_cFGZVdB8RZEIPM,26
+upgini/__about__.py,sha256=Dv8DzHbPAHs_fY_MACW4HNqnYW7CilejShdVPFkTaYM,26
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=9xYeqp-Ti3-QcsucyxlDFOHQef6ZQsBX7bOZMCyT2rM,31665
+upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=C9pZKusj_QnG9coPVAa1a_88VC-lLR4Tre4uC10yt04,231852
-upgini/http.py,sha256=CzDgSrYH6-R14G0d8xPyLalb-w42fjj9XOHVXh7leyM,44835
+upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
+upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
 upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
 upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
 upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
 upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=suRmAF1i7yiZ8vJjpEKdVr5Wqtr7o1_vjAhaN9B4DU0,26518
+upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbxtSDV94OCM3eqUU,26653
 upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=cNeVkWZMyjGCYGqmOOeJqisqPSEBtmfIw_U1rmgQw4w,29285
+upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
 upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
-upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
+upgini/utils/features_validator.py,sha256=RAnfX80GBFcz6-SlTSR0DF6BZzf7A7IL8dlIqEoSz_s,4265
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
 upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -68,13 +68,13 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
 upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
-upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
+upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
 upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.120a1.dist-info/METADATA,sha256=Ai4c0bpRvXFgEYB78zVltQNbWv6HpPdc96IAw85kPJI,50745
-upgini-1.2.120a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.120a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.120a1.dist-info/RECORD,,
+upgini-1.2.121a2.dist-info/METADATA,sha256=1XVh2jWKC2I3ElN4ftyEveTny9C1pU5z69Osnp6q7_s,50745
+upgini-1.2.121a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.121a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.121a2.dist-info/RECORD,,