upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +48 -78
- upgini/features_enricher.py +726 -516
- upgini/http.py +15 -19
- upgini/metadata.py +1 -10
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +8 -6
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/search_task.py +6 -0
- upgini/utils/config.py +43 -0
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/display_utils.py +1 -1
- upgini/utils/feature_info.py +5 -0
- upgini/utils/hash_utils.py +159 -0
- upgini/utils/psi.py +300 -0
- upgini/utils/sample_utils.py +45 -42
- upgini/utils/target_utils.py +53 -2
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/METADATA +62 -32
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/RECORD +22 -19
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/licenses/LICENSE +0 -0
upgini/http.py
CHANGED
@@ -45,6 +45,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.resource_bundle import bundle
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.track_info import get_track_metrics
 
 UPGINI_URL: str = "UPGINI_URL"
@@ -276,6 +277,7 @@ class _RestClient:
     SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
     SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file?digest={0}"
     TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
+    SEARCH_SELECTED_FEATURES_URI_FMT = SERVICE_ROOT_V2 + "search/{0}/selected-features"
 
     UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
     SEND_LOG_EVENT_URI = "private/api/v2/events/send"
@@ -427,7 +429,7 @@ class _RestClient:
         api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
 
         def upload_with_check(path: str, file_name: str):
-            digest_sha256 = self.compute_file_digest(path)
+            digest_sha256 = file_hash(path)
             if self.is_file_uploaded(trace_id, digest_sha256):
                 # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
                 return
@@ -448,16 +450,6 @@ class _RestClient:
         if eval_y_path:
             upload_with_check(eval_y_path, "eval_y.parquet")
 
-    @staticmethod
-    def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
-        hash_func = getattr(hashlib, algorithm)()
-
-        with open(filepath, "rb") as f:
-            for chunk in iter(lambda: f.read(chunk_size), b""):
-                hash_func.update(chunk)
-
-        return hash_func.hexdigest()
-
     def initial_search_v2(
         self,
         trace_id: str,
@@ -478,10 +470,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-
-        # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
@@ -576,10 +565,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-
-        # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
@@ -729,6 +715,16 @@ class _RestClient:
         )
         return TransformUsage(response)
 
+    def update_selected_features(self, trace_id: str, search_task_id: str, selected_features: list[str]):
+        api_path = self.SEARCH_SELECTED_FEATURES_URI_FMT.format(search_task_id)
+        request = {"features": selected_features}
+        self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
+
+    def get_selected_features(self, trace_id: str, search_task_id: str) -> list[str] | None:
+        api_path = self.SEARCH_SELECTED_FEATURES_URI_FMT.format(search_task_id)
+        response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
+        return response.get("features")
+
     def send_log_event(self, log_event: LogEvent):
         api_path = self.SEND_LOG_EVENT_URI
         try:
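Both the dump-input upload helper and the search payload upload now compute the SHA-256 digest through `file_hash` from the new `upgini.utils.hash_utils` module instead of the removed `compute_file_digest` static method. A minimal sanity sketch (not part of the diff) showing the new helper is a drop-in for the removed chunked-hashlib logic:

```python
import hashlib
import tempfile

from upgini.utils.hash_utils import file_hash


def old_style_digest(filepath: str, algorithm: str = "sha256", chunk_size: int = 4096) -> str:
    # Same chunked-hashlib logic as the removed _RestClient.compute_file_digest.
    hash_func = getattr(hashlib, algorithm)()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
    tmp.write(b"example payload")
    path = tmp.name

assert file_hash(path) == old_style_digest(path)
```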
upgini/metadata.py
CHANGED
@@ -285,6 +285,7 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
+    psi_value: Optional[float] = None
 
 
 class HitRateMetrics(BaseModel):
@@ -326,13 +327,6 @@ class ProviderTaskMetadataV2(BaseModel):
     generated_features: Optional[List[GeneratedFeatureMetadata]] = None
 
 
-class FeaturesFilter(BaseModel):
-    minImportance: Optional[float] = None
-    maxPSI: Optional[float] = None
-    maxCount: Optional[int] = None
-    selectedFeatures: Optional[List[str]] = None
-
-
 class RuntimeParameters(BaseModel):
     properties: Dict[str, Any] = {}
 
@@ -342,11 +336,8 @@ class AutoFEParameters(BaseModel):
 
 
 class SearchCustomization(BaseModel):
-    featuresFilter: Optional[FeaturesFilter] = None
     extractFeatures: Optional[bool] = None
     accurateModel: Optional[bool] = None
-    importanceThreshold: Optional[float] = None
-    maxFeatures: Optional[int] = None
     returnScores: Optional[bool] = None
     runtimeParameters: Optional[RuntimeParameters] = None
     metricsCalculation: Optional[bool] = None
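The only metadata addition is the optional `psi_value` on `FeaturesMetadataV2`, which later feeds the new "PSI value" column of the features report (see the `feature_info.py` and `strings.properties` changes below); the `FeaturesFilter` model and the importance/max-features knobs are dropped from `SearchCustomization`. A toy pydantic sketch (illustrative stand-in, not the real `FeaturesMetadataV2`, which has many more fields) of how such an optional field behaves:

```python
from typing import Optional

from pydantic import BaseModel


class FeatureMetaSketch(BaseModel):
    # Hypothetical model used only to illustrate the new optional field.
    name: str
    shap_value: float
    psi_value: Optional[float] = None  # new in 1.2.114; stays None when no PSI is reported


meta = FeatureMetaSketch(name="feature_a", shap_value=0.12)
print(meta.psi_value)  # None
print(meta)
```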
upgini/metrics.py
CHANGED
@@ -816,7 +816,8 @@ class CatBoostWrapper(EstimatorWrapper):
            else:
                encoded = cat_encoder.transform(x[self.cat_features])
                cat_features = encoded.columns.to_list()
-                x.
+                x.drop(columns=encoded.columns, inplace=True, errors="ignore")
+                x[encoded.columns] = encoded
         else:
             cat_features = self.cat_features
 
@@ -1175,7 +1176,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if (y_true < 0).any():
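The `_ext_mean_squared_log_error` change guards against a signature difference in scikit-learn's private `_check_reg_targets` helper, which in newer releases also takes and returns `sample_weight`. A self-contained sketch of the same try/except-TypeError fallback, using a hypothetical stand-in instead of the private sklearn function:

```python
def _check_targets_old(y_true, y_pred, multioutput):
    # Hypothetical stand-in for the older three-argument signature.
    return "old-signature", y_true, y_pred, multioutput


def checked(check_fn, y_true, y_pred, sample_weight, multioutput):
    try:
        # Older call shape first; if the installed version requires the extra
        # positional sample_weight argument, this raises TypeError.
        return check_fn(y_true, y_pred, multioutput)
    except TypeError:
        return check_fn(y_true, y_pred, sample_weight, multioutput)


print(checked(_check_targets_old, [1, 2], [1, 2], None, "uniform_average")[0])  # "old-signature"
```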
upgini/resource_bundle/strings.properties
CHANGED
@@ -72,9 +72,6 @@ binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
 binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set
 
 # Validation errors
-# params validation
-invalid_importance_threshold=importance_threshold must be float
-invalid_max_features=max_features must be int
 # search keys validation
 search_key_differ_from_fit=With search_id passed as a parameter, search_keys should same as for fit call\nSee docs https://github.com/upgini/upgini#61-reuse-completed-search-for-enrichment-without-fit-run
 empty_search_keys=At least one column with a search key required\nSee docs https://github.com/upgini/upgini#3--choose-one-or-multiple-columns-as-a-search-keys
@@ -123,7 +120,7 @@ train_unstable_target=Your training sample contains an unstable target event, PSI
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +136,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
 
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -163,6 +162,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -183,6 +183,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation
@@ -240,7 +241,7 @@ validation_all_valid_status=All valid
 validation_all_valid_message= -
 validation_drop_message= Invalid rows will be dropped.
 validation_some_invalid_status=Some invalid
-validation_invalid_message={:.
+validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
 validation_all_invalid_status=All invalid
 validation_all_valid_color=#DAF7A6
 validation_some_invalid_color=#FFC300
@@ -250,11 +251,12 @@ validation_text_color=black
 
 # Features info table
 features_info_header=\n{} relevant feature(s) found with the search keys: {}
-relevant_features_header=Relevant features
+relevant_features_header=Relevant features ({})
 features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
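Several of the new strings describe out-of-time (OOT) evaluation sets: the updated `eval_set_invalid_tuple_size` message implies that an eval_set entry may now be an X-only tuple, which needs the date search key and is used only for the PSI stability check (and is ignored if it shrinks below 1000 rows after deduplication). A hedged usage sketch of that inferred form; column names are illustrative and actually running a search requires an Upgini API key:

```python
import pandas as pd

from upgini import FeaturesEnricher, SearchKey

train = pd.DataFrame(
    {
        "rep_date": pd.date_range("2023-01-01", periods=500, freq="D"),
        "target": [0, 1] * 250,
    }
)
# No labels here: an X-only entry is treated as an OOT slice for the stability check.
oot = pd.DataFrame({"rep_date": pd.date_range("2024-06-01", periods=1500, freq="D")})

enricher = FeaturesEnricher(search_keys={"rep_date": SearchKey.DATE})
enricher.fit(
    train.drop(columns="target"),
    train["target"],
    eval_set=[(oot,)],  # assumed X-only OOT form implied by the new messages
)
```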
upgini/sampler/base.py
CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT
 
@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y
 
 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 
@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y =
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y
 
     def _more_tags(self):
upgini/sampler/random_under_sampler.py
CHANGED
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
 
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
-            reset=True,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-        )
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
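Both sampler changes work around scikit-learn API drift: recent releases drop `BaseEstimator._validate_data` and rename the `force_all_finite` flag to `ensure_all_finite`, hence the `AttributeError` fallback above. A small self-contained sketch of handling the keyword rename directly with `check_X_y`, trying both spellings:

```python
import numpy as np
from sklearn.utils.validation import check_X_y

X = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
y = np.array([0, 1, 0])

try:
    # Newer scikit-learn spelling of the "allow NaN/inf" flag.
    X_checked, y_checked = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None, ensure_all_finite=False)
except TypeError:
    # Older scikit-learn spelling.
    X_checked, y_checked = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False)

print(X_checked.shape, y_checked.shape)  # (3, 2) (3,)
```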
upgini/search_task.py
CHANGED
@@ -312,6 +312,12 @@ class SearchTask:
     def get_file_metadata(self, trace_id: str) -> FileMetadata:
         return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
 
+    def update_selected_features(self, trace_id: str, selected_features: list[str]):
+        self.rest_client.update_selected_features(trace_id, self.search_task_id, selected_features)
+
+    def get_selected_features(self, trace_id: str) -> list[str] | None:
+        return self.rest_client.get_selected_features(trace_id, self.search_task_id)
+
 
     @lru_cache
     def _get_all_initial_raw_features_cached(
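A short usage sketch for the new `SearchTask` helpers, which simply delegate to the REST client methods added in `http.py`. Here `search_task` is assumed to be an existing, completed `SearchTask` instance (for example, the one created by a `FeaturesEnricher` fit), and `trace_id` is any request-tracing string:

```python
import uuid

trace_id = str(uuid.uuid4())

# Persist a manual feature selection on the server side for this search task...
search_task.update_selected_features(trace_id, ["feature_a", "feature_b"])

# ...and read it back later; may return None when nothing was stored.
print(search_task.get_selected_features(trace_id))
```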
upgini/utils/config.py
ADDED
@@ -0,0 +1,43 @@
+from dataclasses import dataclass, field
+from typing import List
+
+import pandas as pd
+
+# Constants for SampleConfig
+TS_MIN_DIFFERENT_IDS_RATIO = 0.2
+TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
+TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
+TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
+FIT_SAMPLE_ROWS_TS = 100_000
+
+BINARY_MIN_SAMPLE_THRESHOLD = 5_000
+MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
+BINARY_BOOTSTRAP_LOOPS = 5
+MULTICLASS_BOOTSTRAP_LOOPS = 2
+
+FIT_SAMPLE_THRESHOLD = 100_000
+FIT_SAMPLE_ROWS = 100_000
+FIT_SAMPLE_ROWS_WITH_EVAL_SET = 100_000
+FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 100_000
+
+
+@dataclass
+class SampleConfig:
+    force_sample_size: int = 7000
+    ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
+    ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
+    binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
+    multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
+    binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
+    multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
+    fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
+    fit_sample_rows: int = FIT_SAMPLE_ROWS
+    fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
+    fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
+    fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
upgini/utils/deduplicate_utils.py
CHANGED
@@ -134,8 +134,13 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -145,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -190,16 +195,59 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0 check separately, all TARGET values are NaN
+        df_for_dedup = df
+        oot_df = None
+
+        if EVAL_SET_INDEX in df.columns:
+            oot_eval_dfs = []
+            other_dfs = []
+            for eval_idx in df[EVAL_SET_INDEX].unique():
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                    oot_eval_dfs.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+                else:
+                    other_dfs.append(eval_subset)
+
+            if oot_eval_dfs:
+                oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+                df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+            else:
+                df_for_dedup = df
+
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = len(df) - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / len(df)
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
 
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df = df.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if oot_df is not None:
+            df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+            marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+            if marked_duplicates.sum() > 0:
+                dups_indices = df[marked_duplicates].index.to_list()[:100]
+                nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+                num_dup_rows = len(df) - nrows_after_tgt_dedup
+                share_tgt_dedup = 100 * num_dup_rows / len(df)
+                msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                    share_tgt_dedup, num_dup_rows, dups_indices
+                )
+                df = df.drop_duplicates(subset=unique_columns, keep="first")
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup
 
     return df, msg
 
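A self-contained toy illustration (toy data, not the library's internal frame layout) of the rule `clean_full_duplicates` now applies: eval parts whose target is entirely NaN count as OOT, are excluded from the conflicting-target deduplication, and are concatenated back afterwards.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "eval_set_index": [0, 0, 0, 1, 2, 2],
        "key": ["a", "a", "b", "c", "a", "d"],
        "target": [0, 1, 1, 0, np.nan, np.nan],  # eval part 2 has no labels -> treated as OOT
    }
)

is_oot = (df["eval_set_index"] != 0) & df.groupby("eval_set_index")["target"].transform(
    lambda s: s.isna().all()
)
oot_part, rest = df[is_oot], df[~is_oot]

# Rows sharing the same key but conflicting targets are dropped only from the non-OOT part.
deduped = rest[~rest.duplicated(subset=["key"], keep=False)]
result = pd.concat([deduped, oot_part], ignore_index=False)
print(result)
```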
upgini/utils/display_utils.py
CHANGED
upgini/utils/feature_info.py
CHANGED
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None
 
     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )
 
     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini/utils/hash_utils.py
ADDED
@@ -0,0 +1,159 @@
+import hashlib
+import os
+import platform
+import shutil
+import subprocess
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+
+def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+    """
+    Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+    If no suitable utility is found, gracefully falls back to hashlib.
+
+    Supported algo values (depend on OS and available utilities):
+    - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+    On Windows uses `certutil`.
+    On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+    On macOS uses `shasum -a N` or `md5` for MD5.
+    """
+    p = str(Path(path))
+
+    sysname = platform.system().lower()
+    algo = algo.lower()
+
+    # -------- command attempts depending on OS --------
+    candidates: list[list[str]] = []
+
+    if sysname == "windows":
+        # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+        name_map = {
+            "md5": "MD5",
+            "sha1": "SHA1",
+            "sha224": None,  # certutil doesn't support
+            "sha256": "SHA256",
+            "sha384": "SHA384",
+            "sha512": "SHA512",
+        }
+        cert_name = name_map.get(algo)
+        if cert_name:
+            candidates.append(["certutil", "-hashfile", p, cert_name])
+    else:
+        # Unix-like systems
+        # 1) specialized *sum utility if available (usually present on Linux)
+        sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+        if shutil.which(sum_cmd):
+            candidates.append([sum_cmd, p])
+
+        # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+        shasum_bits = {
+            "sha1": "1",
+            "sha224": "224",
+            "sha256": "256",
+            "sha384": "384",
+            "sha512": "512",
+        }
+        if algo in shasum_bits and shutil.which("shasum"):
+            candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+        # 3) for MD5 on macOS there's often a separate `md5` utility
+        if algo == "md5" and shutil.which("md5"):
+            candidates.append(["md5", p])
+
+    # -------- try system utilities --------
+    for cmd in candidates:
+        try:
+            out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+            digest = _parse_hash_output(out, cmd[0])
+            if digest:
+                return digest.lower()
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            continue  # try next candidate
+
+    # -------- reliable fallback to hashlib --------
+    import hashlib
+
+    try:
+        h = getattr(hashlib, algo)
+    except AttributeError:
+        raise ValueError(f"Algorithm not supported: {algo}")
+
+    hasher = h()
+    with open(p, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest().lower()
+
+
+def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+    """
+    Converts output from different utilities to clean hash.
+    Supports:
+    - sha*sum / shasum: '<hex> <filename>'
+    - certutil (Windows): line with second element as hash (spaces inside are removed)
+    - md5 (macOS): 'MD5 (file) = <hex>'
+    """
+    tool = tool.lower()
+    lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+    if not lines:
+        return None
+
+    if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+        # format: '<hex> <filename>'
+        first = lines[0]
+        parts = first.split()
+        return parts[0] if parts else None
+
+    if tool == "certutil":
+        # format:
+        # SHA256 hash of file <path>:
+        # <AA BB CC ...>
+        # CertUtil: -hashfile command completed successfully.
+        if len(lines) >= 2:
+            # Second line contains hex with spaces
+            candidate = lines[1].replace(" ", "")
+            # ensure it's hex
+            if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                return candidate
+        return None
+
+    if tool == "md5":
+        # format: 'MD5 (<file>) = <hex>'
+        last = lines[-1]
+        if "=" in last:
+            return last.split("=", 1)[1].strip()
+        # sometimes md5 can return just the hash
+        parts = last.split()
+        if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+            return parts[-1]
+        return None
+
+    # as a last resort: take the first "looks like hash" word
+    for ln in lines:
+        for token in ln.split():
+            if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                return token
+    return None
+
+
+def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
+    hashed_objects = []
+    try:
+        hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
+        if y is not None:
+            hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
+        if eval_set is not None:
+            if isinstance(eval_set, tuple):
+                eval_set = [eval_set]
+            for eval_X, eval_y in eval_set:
+                hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
+                hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
+        common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
+        return common_hash
+    except Exception:
+        return ""