upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
upgini/http.py CHANGED
@@ -45,6 +45,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.resource_bundle import bundle
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.track_info import get_track_metrics
 
 UPGINI_URL: str = "UPGINI_URL"
@@ -276,6 +277,7 @@ class _RestClient:
     SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
     SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file?digest={0}"
     TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
+    SEARCH_SELECTED_FEATURES_URI_FMT = SERVICE_ROOT_V2 + "search/{0}/selected-features"
 
     UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
     SEND_LOG_EVENT_URI = "private/api/v2/events/send"
@@ -427,7 +429,7 @@ class _RestClient:
         api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
 
         def upload_with_check(path: str, file_name: str):
-            digest_sha256 = self.compute_file_digest(path)
+            digest_sha256 = file_hash(path)
             if self.is_file_uploaded(trace_id, digest_sha256):
                 # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
                 return
@@ -448,16 +450,6 @@ class _RestClient:
         if eval_y_path:
             upload_with_check(eval_y_path, "eval_y.parquet")
 
-    @staticmethod
-    def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
-        hash_func = getattr(hashlib, algorithm)()
-
-        with open(filepath, "rb") as f:
-            for chunk in iter(lambda: f.read(chunk_size), b""):
-                hash_func.update(chunk)
-
-        return hash_func.hexdigest()
-
     def initial_search_v2(
         self,
         trace_id: str,
@@ -478,10 +470,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-        # digest_sha256 = hashlib.sha256(
-        #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
@@ -576,10 +565,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-        # digest_sha256 = hashlib.sha256(
-        #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
@@ -729,6 +715,16 @@ class _RestClient:
         )
         return TransformUsage(response)
 
+    def update_selected_features(self, trace_id: str, search_task_id: str, selected_features: list[str]):
+        api_path = self.SEARCH_SELECTED_FEATURES_URI_FMT.format(search_task_id)
+        request = {"features": selected_features}
+        self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
+
+    def get_selected_features(self, trace_id: str, search_task_id: str) -> list[str] | None:
+        api_path = self.SEARCH_SELECTED_FEATURES_URI_FMT.format(search_task_id)
+        response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
+        return response.get("features")
+
     def send_log_event(self, log_event: LogEvent):
         api_path = self.SEND_LOG_EVENT_URI
         try:
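The two new `_RestClient` methods are thin wrappers over the selected-features endpoint. A minimal usage sketch, assuming the client is obtained through upgini's `get_rest_client` helper with its default backend/API-key resolution; the trace and search task identifiers below are placeholders, not values from this diff:

from upgini.http import get_rest_client

client = get_rest_client()          # assumption: defaults resolve backend URL and API key
trace_id = "trace-123"              # placeholder
search_task_id = "task-456"         # placeholder

# Persist a manually curated feature list for the search task
client.update_selected_features(trace_id, search_task_id, ["feature_a", "feature_b"])

# Read it back; returns None when nothing has been stored yet
print(client.get_selected_features(trace_id, search_task_id))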
upgini/metadata.py CHANGED
@@ -285,6 +285,7 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
+    psi_value: Optional[float] = None
 
 
 class HitRateMetrics(BaseModel):
@@ -326,13 +327,6 @@ class ProviderTaskMetadataV2(BaseModel):
     generated_features: Optional[List[GeneratedFeatureMetadata]] = None
 
 
-class FeaturesFilter(BaseModel):
-    minImportance: Optional[float] = None
-    maxPSI: Optional[float] = None
-    maxCount: Optional[int] = None
-    selectedFeatures: Optional[List[str]] = None
-
-
 class RuntimeParameters(BaseModel):
     properties: Dict[str, Any] = {}
 
@@ -342,11 +336,8 @@ class AutoFEParameters(BaseModel):
 
 
 class SearchCustomization(BaseModel):
-    featuresFilter: Optional[FeaturesFilter] = None
     extractFeatures: Optional[bool] = None
     accurateModel: Optional[bool] = None
-    importanceThreshold: Optional[float] = None
-    maxFeatures: Optional[int] = None
     returnScores: Optional[bool] = None
     runtimeParameters: Optional[RuntimeParameters] = None
     metricsCalculation: Optional[bool] = None
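With `FeaturesFilter`, `importanceThreshold`, and `maxFeatures` removed, a `SearchCustomization` is now built only from the remaining fields. A minimal sketch of a customization that still validates after this change (all field values are illustrative):

from upgini.metadata import RuntimeParameters, SearchCustomization

customization = SearchCustomization(
    extractFeatures=True,
    accurateModel=False,
    returnScores=True,
    runtimeParameters=RuntimeParameters(properties={"someKey": "someValue"}),
    metricsCalculation=False,
)
print(customization)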
upgini/metrics.py CHANGED
@@ -816,7 +816,8 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 encoded = cat_encoder.transform(x[self.cat_features])
                 cat_features = encoded.columns.to_list()
-                x.loc[:, self.cat_features] = encoded
+                x.drop(columns=encoded.columns, inplace=True, errors="ignore")
+                x[encoded.columns] = encoded
         else:
             cat_features = self.cat_features
 
@@ -1175,7 +1176,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-    _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)
 
     if (y_true < 0).any():
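The `_ext_mean_squared_log_error` change is a signature-compatibility shim: the try-branch calls the private `_check_reg_targets(y_true, y_pred, multioutput)` as older scikit-learn builds expose it, and the except-branch assumes a newer variant that also takes and returns `sample_weight`. The same pattern shown standalone, with a hypothetical `new_style_check` standing in for the library helper so both call shapes are explicit:

# `new_style_check` is a stand-in for a helper whose signature grew an extra
# positional argument between library versions; it is not part of upgini.
def new_style_check(y_true, y_pred, sample_weight, multioutput):
    return "continuous", y_true, y_pred, sample_weight, multioutput


def call_with_fallback(check_fn, y_true, y_pred, sample_weight=None, multioutput="uniform_average"):
    try:
        # old signature: (y_true, y_pred, multioutput)
        _, y_true, y_pred, multioutput = check_fn(y_true, y_pred, multioutput)
    except TypeError:
        # newer signature: (y_true, y_pred, sample_weight, multioutput)
        _, y_true, y_pred, sample_weight, multioutput = check_fn(y_true, y_pred, sample_weight, multioutput)
    return y_true, y_pred, sample_weight, multioutput


print(call_with_fallback(new_style_check, [3, 5], [2.5, 5.0]))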
@@ -72,9 +72,6 @@ binary_target_unique_count_not_2=Binary target should contain only 2 unique valu
 binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set
 
 # Validation errors
-# params validation
-invalid_importance_threshold=importance_threshold must be float
-invalid_max_features=max_features must be int
 # search keys validation
 search_key_differ_from_fit=With search_id passed as a parameter, search_keys should same as for fit call\nSee docs https://github.com/upgini/upgini#61-reuse-completed-search-for-enrichment-without-fit-run
 empty_search_keys=At least one column with a search key required\nSee docs https://github.com/upgini/upgini#3--choose-one-or-multiple-columns-as-a-search-keys
@@ -123,7 +120,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +136,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
 
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -163,6 +162,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -183,6 +183,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation
@@ -240,7 +241,7 @@ validation_all_valid_status=All valid
 validation_all_valid_message= -
 validation_drop_message= Invalid rows will be dropped.
 validation_some_invalid_status=Some invalid
-validation_invalid_message={:.1f}% values failed validation and removed from dataframe, invalid values: {}
+validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
 validation_all_invalid_status=All invalid
 validation_all_valid_color=#DAF7A6
 validation_some_invalid_color=#FFC300
@@ -250,11 +251,12 @@ validation_text_color=black
 
 # Features info table
 features_info_header=\n{} relevant feature(s) found with the search keys: {}
-relevant_features_header=Relevant features
+relevant_features_header=Relevant features ({})
 features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
upgini/sampler/base.py CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT
 
@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y
 
 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 
@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y
 
     def _more_tags(self):
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
 
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
-            reset=True,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-        )
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
upgini/search_task.py CHANGED
@@ -312,6 +312,12 @@ class SearchTask:
     def get_file_metadata(self, trace_id: str) -> FileMetadata:
         return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
 
+    def update_selected_features(self, trace_id: str, selected_features: list[str]):
+        self.rest_client.update_selected_features(trace_id, self.search_task_id, selected_features)
+
+    def get_selected_features(self, trace_id: str) -> list[str] | None:
+        return self.rest_client.get_selected_features(trace_id, self.search_task_id)
+
 
 @lru_cache
 def _get_all_initial_raw_features_cached(
upgini/utils/config.py ADDED
@@ -0,0 +1,43 @@
+from dataclasses import dataclass, field
+from typing import List
+
+import pandas as pd
+
+# Constants for SampleConfig
+TS_MIN_DIFFERENT_IDS_RATIO = 0.2
+TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
+TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
+TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
+FIT_SAMPLE_ROWS_TS = 100_000
+
+BINARY_MIN_SAMPLE_THRESHOLD = 5_000
+MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
+BINARY_BOOTSTRAP_LOOPS = 5
+MULTICLASS_BOOTSTRAP_LOOPS = 2
+
+FIT_SAMPLE_THRESHOLD = 100_000
+FIT_SAMPLE_ROWS = 100_000
+FIT_SAMPLE_ROWS_WITH_EVAL_SET = 100_000
+FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 100_000
+
+
+@dataclass
+class SampleConfig:
+    force_sample_size: int = 7000
+    ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
+    ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
+    binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
+    multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
+    binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
+    multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
+    fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
+    fit_sample_rows: int = FIT_SAMPLE_ROWS
+    fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
+    fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
+    fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
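A short sketch of how the new dataclass is meant to be used: instantiate it for the defaults and override individual knobs as needed (the override value below is made up):

from upgini.utils.config import SampleConfig

default_config = SampleConfig()
print(default_config.fit_sample_rows)                  # 100000
print(default_config.ts_default_time_unit_threshold)   # 28 days, i.e. pd.Timedelta(weeks=4)

# Override a single field; everything else keeps its default
small_config = SampleConfig(force_sample_size=1_000)
print(small_config.force_sample_size)                  # 1000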
@@ -134,8 +134,13 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -145,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -190,16 +195,59 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0 check separately, all TARGET values are NaN
+        df_for_dedup = df
+        oot_df = None
+
+        if EVAL_SET_INDEX in df.columns:
+            oot_eval_dfs = []
+            other_dfs = []
+            for eval_idx in df[EVAL_SET_INDEX].unique():
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                    oot_eval_dfs.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+                else:
+                    other_dfs.append(eval_subset)
+
+            if oot_eval_dfs:
+                oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+                df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+            else:
+                df_for_dedup = df
+
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
 
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df = df.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if oot_df is not None:
+            df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+            marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+            if marked_duplicates.sum() > 0:
+                dups_indices = df[marked_duplicates].index.to_list()[:100]
+                nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+                num_dup_rows = len(df) - nrows_after_tgt_dedup
+                share_tgt_dedup = 100 * num_dup_rows / len(df)
+                msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                    share_tgt_dedup, num_dup_rows, dups_indices
+                )
+                df = df.drop_duplicates(subset=unique_columns, keep="first")
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup
 
     return df, msg
 
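Both hunks above rely on the same convention: an eval_set part whose target is entirely NaN is treated as an out-of-time (OOT) slice, excluded from target-based deduplication, and concatenated back afterwards. A tiny pandas sketch of that detection rule (column names mirror the TARGET and EVAL_SET_INDEX constants used above; the data is made up):

import numpy as np
import pandas as pd

TARGET = "target"
EVAL_SET_INDEX = "eval_set_index"

df = pd.DataFrame(
    {
        EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
        TARGET: [1, 0, 0, 1, np.nan, np.nan],  # eval part 2 has no labels -> OOT
        "feature": [10, 20, 30, 40, 50, 60],
    }
)

for eval_idx, part in df.groupby(EVAL_SET_INDEX):
    is_oot = eval_idx != 0 and part[TARGET].isna().all()
    print(eval_idx, "OOT" if is_oot else "labeled")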
@@ -137,7 +137,7 @@ def display_html_dataframe(
         {table_html}
     </div>
     """
-    if display_handle:
+    if display_handle is not None:
         return display_handle.update(HTML(result_html))
     else:
         return display(HTML(result_html), display_id=display_id)
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None
 
     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )
 
     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini/utils/hash_utils.py ADDED
@@ -0,0 +1,159 @@
+import hashlib
+import os
+import platform
+import shutil
+import subprocess
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+
+def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+    """
+    Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+    If no suitable utility is found, gracefully falls back to hashlib.
+
+    Supported algo values (depend on OS and available utilities):
+      - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+    On Windows uses `certutil`.
+    On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+    On macOS uses `shasum -a N` or `md5` for MD5.
+    """
+    p = str(Path(path))
+
+    sysname = platform.system().lower()
+    algo = algo.lower()
+
+    # -------- command attempts depending on OS --------
+    candidates: list[list[str]] = []
+
+    if sysname == "windows":
+        # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+        name_map = {
+            "md5": "MD5",
+            "sha1": "SHA1",
+            "sha224": None,  # certutil doesn't support
+            "sha256": "SHA256",
+            "sha384": "SHA384",
+            "sha512": "SHA512",
+        }
+        cert_name = name_map.get(algo)
+        if cert_name:
+            candidates.append(["certutil", "-hashfile", p, cert_name])
+    else:
+        # Unix-like systems
+        # 1) specialized *sum utility if available (usually present on Linux)
+        sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+        if shutil.which(sum_cmd):
+            candidates.append([sum_cmd, p])
+
+        # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+        shasum_bits = {
+            "sha1": "1",
+            "sha224": "224",
+            "sha256": "256",
+            "sha384": "384",
+            "sha512": "512",
+        }
+        if algo in shasum_bits and shutil.which("shasum"):
+            candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+        # 3) for MD5 on macOS there's often a separate `md5` utility
+        if algo == "md5" and shutil.which("md5"):
+            candidates.append(["md5", p])
+
+    # -------- try system utilities --------
+    for cmd in candidates:
+        try:
+            out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+            digest = _parse_hash_output(out, cmd[0])
+            if digest:
+                return digest.lower()
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            continue  # try next candidate
+
+    # -------- reliable fallback to hashlib --------
+    import hashlib
+
+    try:
+        h = getattr(hashlib, algo)
+    except AttributeError:
+        raise ValueError(f"Algorithm not supported: {algo}")
+
+    hasher = h()
+    with open(p, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest().lower()
+
+
+def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+    """
+    Converts output from different utilities to clean hash.
+    Supports:
+      - sha*sum / shasum: '<hex>  <filename>'
+      - certutil (Windows): line with second element as hash (spaces inside are removed)
+      - md5 (macOS): 'MD5 (file) = <hex>'
+    """
+    tool = tool.lower()
+    lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+    if not lines:
+        return None
+
+    if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+        # format: '<hex>  <filename>'
+        first = lines[0]
+        parts = first.split()
+        return parts[0] if parts else None
+
+    if tool == "certutil":
+        # format:
+        # SHA256 hash of file <path>:
+        # <AA BB CC ...>
+        # CertUtil: -hashfile command completed successfully.
+        if len(lines) >= 2:
+            # Second line contains hex with spaces
+            candidate = lines[1].replace(" ", "")
+            # ensure it's hex
+            if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                return candidate
+        return None
+
+    if tool == "md5":
+        # format: 'MD5 (<file>) = <hex>'
+        last = lines[-1]
+        if "=" in last:
+            return last.split("=", 1)[1].strip()
+        # sometimes md5 can return just the hash
+        parts = last.split()
+        if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+            return parts[-1]
+        return None
+
+    # as a last resort: take the first "looks like hash" word
+    for ln in lines:
+        for token in ln.split():
+            if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                return token
+    return None
+
+
+def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
+    hashed_objects = []
+    try:
+        hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
+        if y is not None:
+            hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
+        if eval_set is not None:
+            if isinstance(eval_set, tuple):
+                eval_set = [eval_set]
+            for eval_X, eval_y in eval_set:
+                hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
+                hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
+        common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
+        return common_hash
+    except Exception:
+        return ""