upgini 1.2.114a2.tar.gz → 1.2.114a4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.114a2 → upgini-1.2.114a4}/PKG-INFO +1 -1
- upgini-1.2.114a4/src/upgini/__about__.py +1 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/features_enricher.py +83 -30
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/http.py +4 -19
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties +1 -1
- upgini-1.2.114a4/src/upgini/utils/hash_utils.py +137 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/psi.py +7 -4
- upgini-1.2.114a2/src/upgini/__about__.py +0 -1
- {upgini-1.2.114a2 → upgini-1.2.114a4}/.gitignore +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/LICENSE +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/README.md +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/pyproject.toml +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/dataset.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/errors.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/metadata.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/metrics.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/search_task.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/spinner.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/version_validator.py +0 -0
upgini-1.2.114a4/src/upgini/__about__.py (new file)
@@ -0,0 +1 @@
+__version__ = "1.2.114a4"
src/upgini/features_enricher.py
@@ -101,6 +101,7 @@ from upgini.utils.email_utils import (
 from upgini.utils.feature_info import FeatureInfo, _round_shap_value
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.format import Format
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyDetector
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -1516,6 +1517,8 @@ class FeaturesEnricher(TransformerMixin):
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
         )
 
+        self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
+
         unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
         if unstable_by_sparsity:
             self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
@@ -1524,6 +1527,8 @@ class FeaturesEnricher(TransformerMixin):
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
         )
 
+        self.logger.info(f"PSI values by value: {psi_values}")
+
         unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
         if unstable_by_value:
             self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
@@ -2105,7 +2110,18 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming = normalizer.columns_renaming
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.
+        df = self._add_fit_system_record_id(
+            df,
+            search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
+        )
 
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -2717,13 +2733,17 @@ if response.status_code == 200:
 
         features_not_to_pass = []
         if add_fit_system_record_id:
-            df = self.
+            df = self._add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
                 TARGET,
                 columns_renaming,
-
+                self.id_columns,
+                self.cv,
+                self.model_task_type,
+                self.logger,
+                self.bundle,
             )
             df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
             features_not_to_pass.append(SORT_ID)
@@ -3263,8 +3283,17 @@ if response.status_code == 200:
             self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))
 
         # Explode multiple search keys
-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            ENTITY_SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )
 
         # TODO check that this is correct for enrichment
@@ -3298,8 +3327,17 @@ if response.status_code == 200:
         if eval_set is not None and len(eval_set) > 0:
             meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
 
-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -4130,14 +4168,18 @@ if response.status_code == 200:
         self.logger.info(f"Finished explosion. Size after: {len(df)}")
         return df, unnest_search_keys
 
-
-
+    @staticmethod
+    def _add_fit_system_record_id(
         df: pd.DataFrame,
         search_keys: Dict[str, SearchKey],
         id_name: str,
         target_name: str,
         columns_renaming: Dict[str, str],
-
+        id_columns: Optional[List[str]],
+        cv: Optional[CVType],
+        model_task_type: ModelTaskType,
+        logger: Optional[logging.Logger] = None,
+        bundle: ResourceBundle = bundle,
     ) -> pd.DataFrame:
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
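Note that turning _add_fit_system_record_id into a @staticmethod does not break the self._add_fit_system_record_id(...) call sites shown above: Python resolves a staticmethod through the instance just as well as through the class. A minimal, package-independent illustration (not upgini code):

    class Demo:
        @staticmethod
        def add_one(value: int) -> int:
            return value + 1

    d = Demo()
    # both forms call the same function, no instance state involved
    assert d.add_one(1) == Demo.add_one(1) == 2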
@@ -4166,32 +4208,33 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []
 
         do_sorting = True
-        if
+        if id_columns and cv is not None and cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in
+            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
             duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
             if date_column is not None:
                 duplicate_check_columns.append(date_column)
 
             duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
             if duplicates.any():
-                raise ValueError(
+                raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
             else:
                 columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
                 columns_to_hash = sort_columns(
                     df[columns_to_hash],
                     target_name,
                     search_keys,
-
+                    model_task_type,
                     sort_exclude_columns,
-                    logger=
+                    logger=logger,
                 )
         else:
             columns_to_hash = sort_columns(
-                df, target_name, search_keys,
+                df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
             )
-
+
+        def sort_df(df: pd.DataFrame) -> pd.DataFrame:
             search_keys_hash = "search_keys_hash"
             if len(columns_to_hash) > 0:
                 factorized_df = df.copy()
@@ -4205,6 +4248,24 @@ if response.status_code == 200:
 
             if search_keys_hash in df.columns:
                 df.drop(columns=search_keys_hash, inplace=True)
+            return df
+
+        if do_sorting:
+            sorted_dfs = []
+            if EVAL_SET_INDEX in df.columns:
+                # Sort train and eval sets separately
+                train = df[df[EVAL_SET_INDEX] == 0].copy()
+                sorted_dfs.append(sort_df(train))
+
+                for eval_set_index in df[EVAL_SET_INDEX].unique():
+                    if eval_set_index == 0:
+                        continue
+                    eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
+                    sorted_dfs.append(sort_df(eval_set_df))
+
+                df = pd.concat(sorted_dfs)
+            else:
+                df = sort_df(df)
 
         df = df.reset_index(drop=True).reset_index()
         # system_record_id saves correct order for fit
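In plain terms, the new do_sorting branch sorts the train slice (EVAL_SET_INDEX == 0) and every eval-set slice independently before re-concatenating them, so the ordering of one set cannot leak into another. A standalone sketch of that idea (not upgini code; sort_slice stands in for the internal sort_df closure and the column names are invented):

    import pandas as pd

    EVAL_SET_INDEX = "eval_set_index"  # assumption: mirrors the constant used in features_enricher

    def sort_slice(part: pd.DataFrame) -> pd.DataFrame:
        # stand-in for sort_df: any deterministic per-slice ordering
        return part.sort_values(by=list(part.columns))

    df = pd.DataFrame({EVAL_SET_INDEX: [1, 0, 1, 0], "x": [4, 3, 2, 1]})

    parts = [sort_slice(df[df[EVAL_SET_INDEX] == 0].copy())]
    for idx in df[EVAL_SET_INDEX].unique():
        if idx == 0:
            continue
        parts.append(sort_slice(df[df[EVAL_SET_INDEX] == idx].copy()))

    df = pd.concat(parts)  # train slice first, then each eval set, each internally sorted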
@@ -4215,11 +4276,6 @@ if response.status_code == 200:
         df.index.name = original_index_name
         df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
 
-        # meaning_types[id_name] = (
-        #     FileColumnMeaningType.SYSTEM_RECORD_ID
-        #     if id_name == SYSTEM_RECORD_ID
-        #     else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
-        # )
         return df
 
     def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -4266,6 +4322,7 @@ if response.status_code == 200:
             self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
             raise RuntimeError(self.bundle.get("features_wasnt_returned"))
 
+        result_features = result_features.copy()
         if EVAL_SET_INDEX in result_features.columns:
             result_features = result_features.drop(columns=EVAL_SET_INDEX)
 
@@ -4993,7 +5050,7 @@ if response.status_code == 200:
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
-            x_digest_sha256 =
+            x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
             if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
                 self.logger.info(
                     f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
@@ -5007,7 +5064,7 @@ if response.status_code == 200:
             if isinstance(y_, pd.Series):
                 y_ = y_.to_frame()
             y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
-            y_digest_sha256 =
+            y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
             if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
                 self.logger.info(
                     f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -5022,9 +5079,7 @@ if response.status_code == 200:
                 if isinstance(eval_x_, pd.Series):
                     eval_x_ = eval_x_.to_frame()
                 eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
-                eval_x_digest_sha256 =
-                    f"{tmp_dir}/eval_x_{idx}.parquet"
-                )
+                eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
                 if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
                     self.logger.info(
                         f"File eval_x_{idx}.parquet was already uploaded with"
@@ -5041,9 +5096,7 @@ if response.status_code == 200:
                 if isinstance(eval_y_, pd.Series):
                     eval_y_ = eval_y_.to_frame()
                 eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
-                eval_y_digest_sha256 =
-                    f"{tmp_dir}/eval_y_{idx}.parquet"
-                )
+                eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
                 if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
                     self.logger.info(
                         f"File eval_y_{idx}.parquet was already uploaded"
src/upgini/http.py
@@ -45,6 +45,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.resource_bundle import bundle
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.track_info import get_track_metrics
 
 UPGINI_URL: str = "UPGINI_URL"
@@ -427,7 +428,7 @@ class _RestClient:
         api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
 
         def upload_with_check(path: str, file_name: str):
-            digest_sha256 =
+            digest_sha256 = file_hash(path)
             if self.is_file_uploaded(trace_id, digest_sha256):
                 # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
                 return
@@ -448,16 +449,6 @@ class _RestClient:
         if eval_y_path:
             upload_with_check(eval_y_path, "eval_y.parquet")
 
-    @staticmethod
-    def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
-        hash_func = getattr(hashlib, algorithm)()
-
-        with open(filepath, "rb") as f:
-            for chunk in iter(lambda: f.read(chunk_size), b""):
-                hash_func.update(chunk)
-
-        return hash_func.hexdigest()
-
     def initial_search_v2(
         self,
         trace_id: str,
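The removed _RestClient.compute_file_digest streamed the file through hashlib; the new file_hash helper keeps that same logic as its fallback path, so digests should stay byte-for-byte compatible. A small standalone check, with a hypothetical file path (old_style_digest is a re-statement of the removed method, not upgini code):

    import hashlib

    from upgini.utils.hash_utils import file_hash

    def old_style_digest(filepath: str, chunk_size: int = 4096) -> str:
        # same streaming-sha256 logic as the removed staticmethod
        hash_func = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_func.update(chunk)
        return hash_func.hexdigest()

    # assert file_hash("some_dump.parquet") == old_style_digest("some_dump.parquet")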
@@ -478,10 +469,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-
-        # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
@@ -576,10 +564,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
 
-
-        # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
 
         with open(file_path, "rb") as file:
src/upgini/resource_bundle/strings.properties
@@ -244,7 +244,7 @@ validation_all_valid_status=All valid
 validation_all_valid_message= -
 validation_drop_message= Invalid rows will be dropped.
 validation_some_invalid_status=Some invalid
-validation_invalid_message={:.
+validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
 validation_all_invalid_status=All invalid
 validation_all_valid_color=#DAF7A6
 validation_some_invalid_color=#FFC300
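For context, the restored value is an ordinary Python format string; a hypothetical rendering with made-up arguments (the bundle.get accessor follows the usage seen elsewhere in this diff):

    message = bundle.get("validation_invalid_message").format(12.5, ["not-an-email", "n/a"])
    # -> "12.50% values failed validation and removed from dataframe, invalid values: ['not-an-email', 'n/a']"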
src/upgini/utils/hash_utils.py (new file)
@@ -0,0 +1,137 @@
+import os
+import platform
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+
+def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+    """
+    Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+    If no suitable utility is found, gracefully falls back to hashlib.
+
+    Supported algo values (depend on OS and available utilities):
+      - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+    On Windows uses `certutil`.
+    On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+    On macOS uses `shasum -a N` or `md5` for MD5.
+    """
+    p = str(Path(path))
+
+    sysname = platform.system().lower()
+    algo = algo.lower()
+
+    # -------- command attempts depending on OS --------
+    candidates: list[list[str]] = []
+
+    if sysname == "windows":
+        # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+        name_map = {
+            "md5": "MD5",
+            "sha1": "SHA1",
+            "sha224": None,  # certutil doesn't support
+            "sha256": "SHA256",
+            "sha384": "SHA384",
+            "sha512": "SHA512",
+        }
+        cert_name = name_map.get(algo)
+        if cert_name:
+            candidates.append(["certutil", "-hashfile", p, cert_name])
+    else:
+        # Unix-like systems
+        # 1) specialized *sum utility if available (usually present on Linux)
+        sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+        if shutil.which(sum_cmd):
+            candidates.append([sum_cmd, p])
+
+        # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+        shasum_bits = {
+            "sha1": "1",
+            "sha224": "224",
+            "sha256": "256",
+            "sha384": "384",
+            "sha512": "512",
+        }
+        if algo in shasum_bits and shutil.which("shasum"):
+            candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+        # 3) for MD5 on macOS there's often a separate `md5` utility
+        if algo == "md5" and shutil.which("md5"):
+            candidates.append(["md5", p])
+
+    # -------- try system utilities --------
+    for cmd in candidates:
+        try:
+            out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+            digest = _parse_hash_output(out, cmd[0])
+            if digest:
+                return digest.lower()
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            continue  # try next candidate
+
+    # -------- reliable fallback to hashlib --------
+    import hashlib
+
+    try:
+        h = getattr(hashlib, algo)
+    except AttributeError:
+        raise ValueError(f"Algorithm not supported: {algo}")
+
+    hasher = h()
+    with open(p, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest().lower()
+
+
+def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+    """
+    Converts output from different utilities to clean hash.
+    Supports:
+      - sha*sum / shasum: '<hex>  <filename>'
+      - certutil (Windows): line with second element as hash (spaces inside are removed)
+      - md5 (macOS): 'MD5 (file) = <hex>'
+    """
+    tool = tool.lower()
+    lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+    if not lines:
+        return None
+
+    if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+        # format: '<hex>  <filename>'
+        first = lines[0]
+        parts = first.split()
+        return parts[0] if parts else None
+
+    if tool == "certutil":
+        # format:
+        #   SHA256 hash of file <path>:
+        #   <AA BB CC ...>
+        #   CertUtil: -hashfile command completed successfully.
+        if len(lines) >= 2:
+            # Second line contains hex with spaces
+            candidate = lines[1].replace(" ", "")
+            # ensure it's hex
+            if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                return candidate
+        return None
+
+    if tool == "md5":
+        # format: 'MD5 (<file>) = <hex>'
+        last = lines[-1]
+        if "=" in last:
+            return last.split("=", 1)[1].strip()
+        # sometimes md5 can return just the hash
+        parts = last.split()
+        if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+            return parts[-1]
+        return None
+
+    # as a last resort: take the first "looks like hash" word
+    for ln in lines:
+        for token in ln.split():
+            if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                return token
+    return None
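A quick way to sanity-check the helper above; the temporary file and the direct hashlib comparison are illustrative only, not part of the package:

    import hashlib
    import tempfile

    from upgini.utils.hash_utils import file_hash

    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(b"some bytes to hash")
        path = tmp.name

    digest = file_hash(path)  # tries certutil / sha256sum / shasum first, falls back to hashlib
    assert digest == hashlib.sha256(b"some bytes to hash").hexdigest()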
src/upgini/utils/psi.py
@@ -77,11 +77,14 @@ def calculate_features_psi(
     psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
     psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
 ) -> Dict[str, float]:
-    empty_res =
+    empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
 
     if not is_numeric_dtype(df[date_column]):
         df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
 
+    # Filter out rows with missing dates
+    df = df[df[date_column].notna()].copy()
+
     n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
 
     if TARGET in df.columns:
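To make the restored default concrete, here is the same comprehension evaluated on a tiny invented frame (TARGET and the column names are stand-ins for illustration):

    import pandas as pd

    TARGET = "target"  # assumption: mirrors upgini.metadata.TARGET
    date_column = "date"
    df = pd.DataFrame({"feature_a": [1, 2], "feature_b": [3, 4], TARGET: [0, 1], date_column: [1, 2]})

    empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
    assert empty_res == {"feature_a": 0.0, "feature_b": 0.0}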
@@ -113,9 +116,9 @@ def calculate_features_psi(
         cat_top_pct=psi_target_params.cat_top_pct,
         agg_func=target_agg_func,
     )
-    if target_psi is None:
+    if target_psi is None or np.isnan(target_psi):
         logger.info("Cannot determine target PSI. Skip feature PSI check")
-        return
+        return empty_res
 
     if target_psi > psi_target_params.threshold:
         logger.info(
@@ -221,7 +224,7 @@ def _stability_agg(
 
     psi_value = agg_func([_psi(reference, c) for c in current])
 
-    return psi_value
+    return float(psi_value)
 
 
 def _get_binned_data(
upgini-1.2.114a2/src/upgini/__about__.py (removed)
@@ -1 +0,0 @@
-__version__ = "1.2.114a2"