upgini 1.2.114a3__tar.gz → 1.2.114a4__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {upgini-1.2.114a3 → upgini-1.2.114a4}/PKG-INFO +1 -1
- upgini-1.2.114a4/src/upgini/__about__.py +1 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/features_enricher.py +79 -30
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/http.py +4 -19
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties +1 -1
- upgini-1.2.114a4/src/upgini/utils/hash_utils.py +137 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/psi.py +4 -1
- upgini-1.2.114a3/src/upgini/__about__.py +0 -1
- {upgini-1.2.114a3 → upgini-1.2.114a4}/.gitignore +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/LICENSE +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/README.md +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/pyproject.toml +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/dataset.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/errors.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/metadata.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/metrics.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/search_task.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/spinner.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/version_validator.py +0 -0
upgini-1.2.114a4/src/upgini/__about__.py (new file)

@@ -0,0 +1 @@
+__version__ = "1.2.114a4"
src/upgini/features_enricher.py

@@ -101,6 +101,7 @@ from upgini.utils.email_utils import (
 from upgini.utils.feature_info import FeatureInfo, _round_shap_value
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.format import Format
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyDetector
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -2109,7 +2110,18 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming = normalizer.columns_renaming

         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.
+        df = self._add_fit_system_record_id(
+            df,
+            search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
+        )

         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -2721,13 +2733,17 @@ if response.status_code == 200:

         features_not_to_pass = []
         if add_fit_system_record_id:
-            df = self.
+            df = self._add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
                 TARGET,
                 columns_renaming,
-
+                self.id_columns,
+                self.cv,
+                self.model_task_type,
+                self.logger,
+                self.bundle,
             )
             df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
             features_not_to_pass.append(SORT_ID)
@@ -3267,8 +3283,17 @@ if response.status_code == 200:
             self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))

         # Explode multiple search keys
-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            ENTITY_SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )

         # TODO check that this is correct for enrichment
@@ -3302,8 +3327,17 @@ if response.status_code == 200:
         if eval_set is not None and len(eval_set) > 0:
             meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX

-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )

         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -4134,14 +4168,18 @@ if response.status_code == 200:
         self.logger.info(f"Finished explosion. Size after: {len(df)}")
         return df, unnest_search_keys

-
-
+    @staticmethod
+    def _add_fit_system_record_id(
         df: pd.DataFrame,
         search_keys: Dict[str, SearchKey],
         id_name: str,
         target_name: str,
         columns_renaming: Dict[str, str],
-
+        id_columns: Optional[List[str]],
+        cv: Optional[CVType],
+        model_task_type: ModelTaskType,
+        logger: Optional[logging.Logger] = None,
+        bundle: ResourceBundle = bundle,
     ) -> pd.DataFrame:
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
@@ -4170,32 +4208,33 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []

         do_sorting = True
-        if
+        if id_columns and cv is not None and cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in
+            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
             duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
             if date_column is not None:
                 duplicate_check_columns.append(date_column)

             duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
             if duplicates.any():
-                raise ValueError(
+                raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
             else:
                 columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
                 columns_to_hash = sort_columns(
                     df[columns_to_hash],
                     target_name,
                     search_keys,
-
+                    model_task_type,
                     sort_exclude_columns,
-                    logger=
+                    logger=logger,
                 )
         else:
             columns_to_hash = sort_columns(
-                df, target_name, search_keys,
+                df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
             )
-
+
+        def sort_df(df: pd.DataFrame) -> pd.DataFrame:
             search_keys_hash = "search_keys_hash"
             if len(columns_to_hash) > 0:
                 factorized_df = df.copy()
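The new time-series branch refuses to fall back to hashing when rows collide on the id columns plus the date. A minimal standalone pandas sketch of the same duplicated(..., keep=False) check (the column names and values below are illustrative, not upgini's):

import pandas as pd

# Two rows share the same (store_id, date) pair.
df = pd.DataFrame(
    {
        "store_id": [1, 1, 2],
        "date": ["2024-01-01", "2024-01-01", "2024-01-01"],
        "target": [10.0, 11.0, 12.0],
    }
)

# keep=False flags every member of a duplicate group, so .sum() counts all colliding rows.
duplicates = df.duplicated(subset=["store_id", "date"], keep=False)
print(duplicates.sum())  # 2 -- in the enricher this is what triggers the ValueError above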
@@ -4209,6 +4248,24 @@ if response.status_code == 200:

             if search_keys_hash in df.columns:
                 df.drop(columns=search_keys_hash, inplace=True)
+            return df
+
+        if do_sorting:
+            sorted_dfs = []
+            if EVAL_SET_INDEX in df.columns:
+                # Sort train and eval sets separately
+                train = df[df[EVAL_SET_INDEX] == 0].copy()
+                sorted_dfs.append(sort_df(train))
+
+                for eval_set_index in df[EVAL_SET_INDEX].unique():
+                    if eval_set_index == 0:
+                        continue
+                    eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
+                    sorted_dfs.append(sort_df(eval_set_df))
+
+                df = pd.concat(sorted_dfs)
+            else:
+                df = sort_df(df)

         df = df.reset_index(drop=True).reset_index()
         # system_record_id saves correct order for fit
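With this change the train part and each eval_set part are ordered independently and then concatenated, so adding or removing eval sets cannot reshuffle the train rows. A rough standalone sketch of that split-sort-concat pattern (the row_hash column and group values are made up; upgini derives its ordering from the factorized search-key hash):

import pandas as pd

def sort_part(part: pd.DataFrame) -> pd.DataFrame:
    # Stand-in for upgini's inner sort_df: order rows by a deterministic key.
    return part.sort_values(by="row_hash", kind="stable")

df = pd.DataFrame({"eval_set_index": [0, 0, 1, 1, 2], "row_hash": [3, 1, 9, 4, 7]})

parts = [sort_part(df[df["eval_set_index"] == 0])]           # train first
for idx in df["eval_set_index"].unique():
    if idx == 0:
        continue
    parts.append(sort_part(df[df["eval_set_index"] == idx]))  # each eval set separately

print(pd.concat(parts))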
@@ -4219,11 +4276,6 @@ if response.status_code == 200:
         df.index.name = original_index_name
         df = df.sort_values(by=original_order_name).drop(columns=original_order_name)

-        # meaning_types[id_name] = (
-        #     FileColumnMeaningType.SYSTEM_RECORD_ID
-        #     if id_name == SYSTEM_RECORD_ID
-        #     else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
-        # )
         return df

     def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -4270,6 +4322,7 @@ if response.status_code == 200:
             self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
             raise RuntimeError(self.bundle.get("features_wasnt_returned"))

+        result_features = result_features.copy()
         if EVAL_SET_INDEX in result_features.columns:
             result_features = result_features.drop(columns=EVAL_SET_INDEX)

@@ -4997,7 +5050,7 @@ if response.status_code == 200:

         with tempfile.TemporaryDirectory() as tmp_dir:
             X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
-            x_digest_sha256 =
+            x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
             if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
                 self.logger.info(
                     f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
@@ -5011,7 +5064,7 @@ if response.status_code == 200:
             if isinstance(y_, pd.Series):
                 y_ = y_.to_frame()
             y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
-            y_digest_sha256 =
+            y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
             if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
                 self.logger.info(
                     f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -5026,9 +5079,7 @@ if response.status_code == 200:
                 if isinstance(eval_x_, pd.Series):
                     eval_x_ = eval_x_.to_frame()
                 eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
-                eval_x_digest_sha256 =
-                    f"{tmp_dir}/eval_x_{idx}.parquet"
-                )
+                eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
                 if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
                     self.logger.info(
                         f"File eval_x_{idx}.parquet was already uploaded with"
@@ -5045,9 +5096,7 @@ if response.status_code == 200:
                 if isinstance(eval_y_, pd.Series):
                     eval_y_ = eval_y_.to_frame()
                 eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
-                eval_y_digest_sha256 =
-                    f"{tmp_dir}/eval_y_{idx}.parquet"
-                )
+                eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
                 if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
                     self.logger.info(
                         f"File eval_y_{idx}.parquet was already uploaded"
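All four upload sites now share the same shape: write the frame to parquet, hash the file with file_hash, and skip the upload if the backend already knows that digest. A rough sketch of the idea, with a local already_uploaded set standing in for rest_client.is_file_uploaded (the stand-in and the sample frame are assumptions for illustration; writing parquet requires pyarrow or fastparquet):

import tempfile

import pandas as pd

from upgini.utils.hash_utils import file_hash  # new in 1.2.114a4

already_uploaded = set()  # hypothetical stand-in for the server-side check

def upload_once(df: pd.DataFrame, name: str) -> None:
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = f"{tmp_dir}/{name}"
        df.to_parquet(path, compression="zstd")
        digest = file_hash(path)
        if digest in already_uploaded:
            print(f"{name} already uploaded with digest {digest}, skipping")
            return
        # ... a real client would POST the file here ...
        already_uploaded.add(digest)

x = pd.DataFrame({"feature": [1, 2, 3]})
upload_once(x, "x.parquet")  # uploads
upload_once(x, "x.parquet")  # skipped: identical content produces the same digest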
src/upgini/http.py

@@ -45,6 +45,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.resource_bundle import bundle
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.track_info import get_track_metrics

 UPGINI_URL: str = "UPGINI_URL"
@@ -427,7 +428,7 @@ class _RestClient:
         api_path = self.SEARCH_DUMP_INPUT_FILE_FMT

         def upload_with_check(path: str, file_name: str):
-            digest_sha256 =
+            digest_sha256 = file_hash(path)
             if self.is_file_uploaded(trace_id, digest_sha256):
                 # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
                 return
@@ -448,16 +449,6 @@ class _RestClient:
         if eval_y_path:
             upload_with_check(eval_y_path, "eval_y.parquet")

-    @staticmethod
-    def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
-        hash_func = getattr(hashlib, algorithm)()
-
-        with open(filepath, "rb") as f:
-            for chunk in iter(lambda: f.read(chunk_size), b""):
-                hash_func.update(chunk)
-
-        return hash_func.hexdigest()
-
     def initial_search_v2(
         self,
         trace_id: str,
@@ -478,10 +469,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

-
-        #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

         with open(file_path, "rb") as file:
@@ -576,10 +564,7 @@ class _RestClient:
         digest = md5_hash.hexdigest()
         metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

-
-        #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
-        # ).hexdigest()
-        digest_sha256 = self.compute_file_digest(file_path)
+        digest_sha256 = file_hash(file_path)
         metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

         with open(file_path, "rb") as file:
src/upgini/resource_bundle/strings.properties

@@ -244,7 +244,7 @@ validation_all_valid_status=All valid
 validation_all_valid_message= -
 validation_drop_message= Invalid rows will be dropped.
 validation_some_invalid_status=Some invalid
-validation_invalid_message={:.
+validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
 validation_all_invalid_status=All invalid
 validation_all_valid_color=#DAF7A6
 validation_some_invalid_color=#FFC300
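The updated validation_invalid_message template carries two placeholders: a percentage rendered with two decimals and the list of offending values. For instance (the values below are made up):

template = "{:.2f}% values failed validation and removed from dataframe, invalid values: {}"
print(template.format(3.14159, ["not-an-email", "foo@"]))
# 3.14% values failed validation and removed from dataframe, invalid values: ['not-an-email', 'foo@']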
upgini-1.2.114a4/src/upgini/utils/hash_utils.py (new file)

@@ -0,0 +1,137 @@
+import os
+import platform
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+
+def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+    """
+    Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+    If no suitable utility is found, gracefully falls back to hashlib.
+
+    Supported algo values (depend on OS and available utilities):
+    - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+    On Windows uses `certutil`.
+    On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+    On macOS uses `shasum -a N` or `md5` for MD5.
+    """
+    p = str(Path(path))
+
+    sysname = platform.system().lower()
+    algo = algo.lower()
+
+    # -------- command attempts depending on OS --------
+    candidates: list[list[str]] = []
+
+    if sysname == "windows":
+        # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+        name_map = {
+            "md5": "MD5",
+            "sha1": "SHA1",
+            "sha224": None,  # certutil doesn't support
+            "sha256": "SHA256",
+            "sha384": "SHA384",
+            "sha512": "SHA512",
+        }
+        cert_name = name_map.get(algo)
+        if cert_name:
+            candidates.append(["certutil", "-hashfile", p, cert_name])
+    else:
+        # Unix-like systems
+        # 1) specialized *sum utility if available (usually present on Linux)
+        sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+        if shutil.which(sum_cmd):
+            candidates.append([sum_cmd, p])
+
+        # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+        shasum_bits = {
+            "sha1": "1",
+            "sha224": "224",
+            "sha256": "256",
+            "sha384": "384",
+            "sha512": "512",
+        }
+        if algo in shasum_bits and shutil.which("shasum"):
+            candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+        # 3) for MD5 on macOS there's often a separate `md5` utility
+        if algo == "md5" and shutil.which("md5"):
+            candidates.append(["md5", p])
+
+    # -------- try system utilities --------
+    for cmd in candidates:
+        try:
+            out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+            digest = _parse_hash_output(out, cmd[0])
+            if digest:
+                return digest.lower()
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            continue  # try next candidate
+
+    # -------- reliable fallback to hashlib --------
+    import hashlib
+
+    try:
+        h = getattr(hashlib, algo)
+    except AttributeError:
+        raise ValueError(f"Algorithm not supported: {algo}")
+
+    hasher = h()
+    with open(p, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest().lower()
+
+
+def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+    """
+    Converts output from different utilities to clean hash.
+    Supports:
+    - sha*sum / shasum: '<hex>  <filename>'
+    - certutil (Windows): line with second element as hash (spaces inside are removed)
+    - md5 (macOS): 'MD5 (file) = <hex>'
+    """
+    tool = tool.lower()
+    lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+    if not lines:
+        return None
+
+    if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+        # format: '<hex>  <filename>'
+        first = lines[0]
+        parts = first.split()
+        return parts[0] if parts else None
+
+    if tool == "certutil":
+        # format:
+        # SHA256 hash of file <path>:
+        # <AA BB CC ...>
+        # CertUtil: -hashfile command completed successfully.
+        if len(lines) >= 2:
+            # Second line contains hex with spaces
+            candidate = lines[1].replace(" ", "")
+            # ensure it's hex
+            if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                return candidate
+        return None
+
+    if tool == "md5":
+        # format: 'MD5 (<file>) = <hex>'
+        last = lines[-1]
+        if "=" in last:
+            return last.split("=", 1)[1].strip()
+        # sometimes md5 can return just the hash
+        parts = last.split()
+        if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+            return parts[-1]
+        return None
+
+    # as a last resort: take the first "looks like hash" word
+    for ln in lines:
+        for token in ln.split():
+            if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                return token
+    return None
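Whichever path is taken (system utility or the hashlib fallback), the helper should return the same lowercase hex digest as hashing the file bytes directly. A small self-check sketch, assuming upgini 1.2.114a4 is installed:

import hashlib
import tempfile
from pathlib import Path

from upgini.utils.hash_utils import file_hash

with tempfile.TemporaryDirectory() as tmp_dir:
    path = Path(tmp_dir) / "sample.bin"
    path.write_bytes(b"upgini hash check")

    # file_hash may shell out to sha256sum/shasum/certutil or fall back to hashlib.
    assert file_hash(path, algo="sha256") == hashlib.sha256(path.read_bytes()).hexdigest()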
src/upgini/utils/psi.py

@@ -82,6 +82,9 @@ def calculate_features_psi(
     if not is_numeric_dtype(df[date_column]):
         df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6

+    # Filter out rows with missing dates
+    df = df[df[date_column].notna()].copy()
+
     n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()

     if TARGET in df.columns:
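Dropping rows whose date is missing keeps NaN timestamps out of the month count and the PSI interval logic, and the .copy() detaches the filtered slice so later in-place edits don't hit pandas' SettingWithCopyWarning. A tiny illustration with made-up epoch-millisecond values:

import pandas as pd

df = pd.DataFrame({"date_ms": [1_700_000_000_000, None, 1_702_000_000_000]})

df = df[df["date_ms"].notna()].copy()  # drop the missing date, detach from the original frame

print(pd.to_datetime(df["date_ms"], unit="ms").dt.month.nunique())  # 2 distinct months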
@@ -221,7 +224,7 @@ def _stability_agg(

     psi_value = agg_func([_psi(reference, c) for c in current])

-    return psi_value
+    return float(psi_value)


 def _get_binned_data(
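agg_func over numpy inputs usually returns a numpy scalar rather than a builtin float, so the explicit float(...) normalizes the return type for downstream consumers. The diff doesn't state the motivation, but the type difference is easy to see:

import numpy as np

psi_value = np.max([0.12, 0.34, 0.05])   # numpy.float64, not a builtin float
print(type(psi_value).__name__)          # float64

psi_value = float(psi_value)             # plain Python float
print(type(psi_value).__name__)          # float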
upgini-1.2.114a3/src/upgini/__about__.py (removed)

@@ -1 +0,0 @@
-__version__ = "1.2.114a3"