upgini 1.2.114a3.tar.gz → 1.2.114a4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {upgini-1.2.114a3 → upgini-1.2.114a4}/PKG-INFO +1 -1
  2. upgini-1.2.114a4/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/features_enricher.py +79 -30
  4. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/http.py +4 -19
  5. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties +1 -1
  6. upgini-1.2.114a4/src/upgini/utils/hash_utils.py +137 -0
  7. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/psi.py +4 -1
  8. upgini-1.2.114a3/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.114a3 → upgini-1.2.114a4}/.gitignore +0 -0
  10. {upgini-1.2.114a3 → upgini-1.2.114a4}/LICENSE +0 -0
  11. {upgini-1.2.114a3 → upgini-1.2.114a4}/README.md +0 -0
  12. {upgini-1.2.114a3 → upgini-1.2.114a4}/pyproject.toml +0 -0
  13. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/all_operators.py +0 -0
  19. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/operator.py +0 -0
  24. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/__init__.py +0 -0
  25. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/base.py +0 -0
  26. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/cross.py +0 -0
  27. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/delta.py +0 -0
  28. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/lag.py +0 -0
  29. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/roll.py +0 -0
  30. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/trend.py +0 -0
  31. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/volatility.py +0 -0
  32. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/unary.py +0 -0
  33. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/utils.py +0 -0
  34. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/autofe/vector.py +0 -0
  35. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/data_source/__init__.py +0 -0
  36. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/data_source/data_source_publisher.py +0 -0
  37. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/dataset.py +0 -0
  38. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/errors.py +0 -0
  39. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/mdc/__init__.py +0 -0
  40. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/mdc/context.py +0 -0
  41. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/metadata.py +0 -0
  42. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/metrics.py +0 -0
  43. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/normalizer/__init__.py +0 -0
  44. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/normalizer/normalize_utils.py +0 -0
  45. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/__init__.py +0 -0
  46. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/exceptions.py +0 -0
  47. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sample_utils.py +0 -0
  75. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sklearn_ext.py +0 -0
  76. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/sort.py +0 -0
  77. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/target_utils.py +0 -0
  78. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/track_info.py +0 -0
  79. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/ts_utils.py +0 -0
  80. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/warning_counter.py +0 -0
  81. {upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/version_validator.py +0 -0

{upgini-1.2.114a3 → upgini-1.2.114a4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.114a3
+ Version: 1.2.114a4
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/

upgini-1.2.114a4/src/upgini/__about__.py
@@ -0,0 +1 @@
+ __version__ = "1.2.114a4"

{upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/features_enricher.py
@@ -101,6 +101,7 @@ from upgini.utils.email_utils import (
  from upgini.utils.feature_info import FeatureInfo, _round_shap_value
  from upgini.utils.features_validator import FeaturesValidator
  from upgini.utils.format import Format
+ from upgini.utils.hash_utils import file_hash
  from upgini.utils.ip_utils import IpSearchKeyConverter
  from upgini.utils.phone_utils import PhoneSearchKeyDetector
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -2109,7 +2110,18 @@ class FeaturesEnricher(TransformerMixin):
  columns_renaming = normalizer.columns_renaming

  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+ df = self._add_fit_system_record_id(
+ df,
+ search_keys,
+ SYSTEM_RECORD_ID,
+ TARGET,
+ columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
+ )

  # Sample after sorting by system_record_id for idempotency
  df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -2721,13 +2733,17 @@ if response.status_code == 200:

  features_not_to_pass = []
  if add_fit_system_record_id:
- df = self.__add_fit_system_record_id(
+ df = self._add_fit_system_record_id(
  df,
  search_keys,
  SYSTEM_RECORD_ID,
  TARGET,
  columns_renaming,
- silent=True,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
  features_not_to_pass.append(SORT_ID)
@@ -3267,8 +3283,17 @@ if response.status_code == 200:
  self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))

  # Explode multiple search keys
- df = self.__add_fit_system_record_id(
- df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
+ df = self._add_fit_system_record_id(
+ df,
+ self.fit_search_keys,
+ ENTITY_SYSTEM_RECORD_ID,
+ TARGET,
+ self.fit_columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )

  # TODO check that this is correct for enrichment
@@ -3302,8 +3327,17 @@ if response.status_code == 200:
  if eval_set is not None and len(eval_set) > 0:
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX

- df = self.__add_fit_system_record_id(
- df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
+ df = self._add_fit_system_record_id(
+ df,
+ self.fit_search_keys,
+ SYSTEM_RECORD_ID,
+ TARGET,
+ self.fit_columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )

  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -4134,14 +4168,18 @@ if response.status_code == 200:
  self.logger.info(f"Finished explosion. Size after: {len(df)}")
  return df, unnest_search_keys

- def __add_fit_system_record_id(
- self,
+ @staticmethod
+ def _add_fit_system_record_id(
  df: pd.DataFrame,
  search_keys: Dict[str, SearchKey],
  id_name: str,
  target_name: str,
  columns_renaming: Dict[str, str],
- silent: bool = False,
+ id_columns: Optional[List[str]],
+ cv: Optional[CVType],
+ model_task_type: ModelTaskType,
+ logger: Optional[logging.Logger] = None,
+ bundle: ResourceBundle = bundle,
  ) -> pd.DataFrame:
  original_index_name = df.index.name
  index_name = df.index.name or DEFAULT_INDEX
@@ -4170,32 +4208,33 @@ if response.status_code == 200:
  columns_to_sort = [date_column] if date_column is not None else []

  do_sorting = True
- if self.id_columns and self.cv is not None and self.cv.is_time_series():
+ if id_columns and cv is not None and cv.is_time_series():
  # Check duplicates by date and id_columns
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
- renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
+ renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
  duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
  if date_column is not None:
  duplicate_check_columns.append(date_column)

  duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
  if duplicates.any():
- raise ValueError(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
+ raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
  else:
  columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
  columns_to_hash = sort_columns(
  df[columns_to_hash],
  target_name,
  search_keys,
- self.model_task_type,
+ model_task_type,
  sort_exclude_columns,
- logger=self.logger,
+ logger=logger,
  )
  else:
  columns_to_hash = sort_columns(
- df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
+ df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
  )
- if do_sorting:
+
+ def sort_df(df: pd.DataFrame) -> pd.DataFrame:
  search_keys_hash = "search_keys_hash"
  if len(columns_to_hash) > 0:
  factorized_df = df.copy()
@@ -4209,6 +4248,24 @@ if response.status_code == 200:

  if search_keys_hash in df.columns:
  df.drop(columns=search_keys_hash, inplace=True)
+ return df
+
+ if do_sorting:
+ sorted_dfs = []
+ if EVAL_SET_INDEX in df.columns:
+ # Sort train and eval sets separately
+ train = df[df[EVAL_SET_INDEX] == 0].copy()
+ sorted_dfs.append(sort_df(train))
+
+ for eval_set_index in df[EVAL_SET_INDEX].unique():
+ if eval_set_index == 0:
+ continue
+ eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
+ sorted_dfs.append(sort_df(eval_set_df))
+
+ df = pd.concat(sorted_dfs)
+ else:
+ df = sort_df(df)

  df = df.reset_index(drop=True).reset_index()
  # system_record_id saves correct order for fit
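
The new sort_df helper is applied per split when an EVAL_SET_INDEX column is present: the train part (index 0) is sorted first, then each eval set is sorted on its own, and the pieces are concatenated. A standalone sketch of that pattern, separate from the diff above (toy data and a trivial stand-in for sort_df; the real helper orders rows by a hash of the selected columns):

import pandas as pd

EVAL_SET_INDEX = "eval_set_index"

def sort_df(part: pd.DataFrame) -> pd.DataFrame:
    # Stand-in: the enricher sorts by a hash of search keys / id columns instead.
    return part.sort_values(by="key")

df = pd.DataFrame({"key": [3, 1, 2, 9, 8], EVAL_SET_INDEX: [0, 0, 0, 1, 1]})

if EVAL_SET_INDEX in df.columns:
    parts = [sort_df(df[df[EVAL_SET_INDEX] == 0].copy())]
    for idx in df[EVAL_SET_INDEX].unique():
        if idx == 0:
            continue
        parts.append(sort_df(df[df[EVAL_SET_INDEX] == idx].copy()))
    df = pd.concat(parts)
else:
    df = sort_df(df)

print(df)  # train rows sorted within themselves, then each eval set sorted within itself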
@@ -4219,11 +4276,6 @@ if response.status_code == 200:
  df.index.name = original_index_name
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)

- # meaning_types[id_name] = (
- # FileColumnMeaningType.SYSTEM_RECORD_ID
- # if id_name == SYSTEM_RECORD_ID
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
- # )
  return df

  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -4270,6 +4322,7 @@ if response.status_code == 200:
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
  raise RuntimeError(self.bundle.get("features_wasnt_returned"))

+ result_features = result_features.copy()
  if EVAL_SET_INDEX in result_features.columns:
  result_features = result_features.drop(columns=EVAL_SET_INDEX)

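The added result_features.copy() is the usual pandas defensive-copy step: later drops and assignments then operate on an independent frame rather than on the object returned by (and possibly held elsewhere in) the search task. A minimal illustration of the difference, with made-up names:

import pandas as pd

cached = pd.DataFrame({"feature": [1, 2], "eval_set_index": [0, 1]})

result = cached.copy()                                 # independent frame
result.drop(columns=["eval_set_index"], inplace=True)  # in-place edits stay local

assert "eval_set_index" in cached.columns              # the original frame is untouched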
@@ -4997,7 +5050,7 @@ if response.status_code == 200:

  with tempfile.TemporaryDirectory() as tmp_dir:
  X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
- x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
+ x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
  if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
  self.logger.info(
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
@@ -5011,7 +5064,7 @@ if response.status_code == 200:
  if isinstance(y_, pd.Series):
  y_ = y_.to_frame()
  y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
- y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
+ y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
  if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
  self.logger.info(
  f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -5026,9 +5079,7 @@ if response.status_code == 200:
  if isinstance(eval_x_, pd.Series):
  eval_x_ = eval_x_.to_frame()
  eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
- eval_x_digest_sha256 = self.rest_client.compute_file_digest(
- f"{tmp_dir}/eval_x_{idx}.parquet"
- )
+ eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
  if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
  self.logger.info(
  f"File eval_x_{idx}.parquet was already uploaded with"
@@ -5045,9 +5096,7 @@ if response.status_code == 200:
  if isinstance(eval_y_, pd.Series):
  eval_y_ = eval_y_.to_frame()
  eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
- eval_y_digest_sha256 = self.rest_client.compute_file_digest(
- f"{tmp_dir}/eval_y_{idx}.parquet"
- )
+ eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
  if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
  self.logger.info(
  f"File eval_y_{idx}.parquet was already uploaded"

{upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/http.py
@@ -45,6 +45,7 @@ from upgini.metadata import (
  SearchCustomization,
  )
  from upgini.resource_bundle import bundle
+ from upgini.utils.hash_utils import file_hash
  from upgini.utils.track_info import get_track_metrics

  UPGINI_URL: str = "UPGINI_URL"
@@ -427,7 +428,7 @@ class _RestClient:
  api_path = self.SEARCH_DUMP_INPUT_FILE_FMT

  def upload_with_check(path: str, file_name: str):
- digest_sha256 = self.compute_file_digest(path)
+ digest_sha256 = file_hash(path)
  if self.is_file_uploaded(trace_id, digest_sha256):
  # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
  return
@@ -448,16 +449,6 @@ class _RestClient:
  if eval_y_path:
  upload_with_check(eval_y_path, "eval_y.parquet")

- @staticmethod
- def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
- hash_func = getattr(hashlib, algorithm)()
-
- with open(filepath, "rb") as f:
- for chunk in iter(lambda: f.read(chunk_size), b""):
- hash_func.update(chunk)
-
- return hash_func.hexdigest()
-
  def initial_search_v2(
  self,
  trace_id: str,
@@ -478,10 +469,7 @@ class _RestClient:
  digest = md5_hash.hexdigest()
  metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

- # digest_sha256 = hashlib.sha256(
- # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
- # ).hexdigest()
- digest_sha256 = self.compute_file_digest(file_path)
+ digest_sha256 = file_hash(file_path)
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

  with open(file_path, "rb") as file:
@@ -576,10 +564,7 @@ class _RestClient:
  digest = md5_hash.hexdigest()
  metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

- # digest_sha256 = hashlib.sha256(
- # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
- # ).hexdigest()
- digest_sha256 = self.compute_file_digest(file_path)
+ digest_sha256 = file_hash(file_path)
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

  with open(file_path, "rb") as file:
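
The removed _RestClient.compute_file_digest and the hashlib fallback inside the new file_hash helper both stream the file through SHA-256, so the reported digests stay the same after the switch; only the chunk size differs (4096 bytes before, 1 MiB now), which has no effect on the result. A quick sanity check using a throwaway temp file:

import hashlib
import tempfile

def digest_chunked(path: str, chunk_size: int) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"upgini" * 100_000)
    path = tmp.name

# Same digest regardless of chunk size: 4096 (old helper) vs 1 MiB (new fallback).
assert digest_chunked(path, 4096) == digest_chunked(path, 1024 * 1024)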

{upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties
@@ -244,7 +244,7 @@ validation_all_valid_status=All valid
  validation_all_valid_message= -
  validation_drop_message= Invalid rows will be dropped.
  validation_some_invalid_status=Some invalid
- validation_invalid_message={:.1f}% values failed validation and removed from dataframe, invalid values: {}
+ validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
  validation_all_invalid_status=All invalid
  validation_all_valid_color=#DAF7A6
  validation_some_invalid_color=#FFC300
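
The validation message now reports the failure rate with two decimal places. A tiny before/after rendering with a made-up 1.236% failure rate:

old = "{:.1f}% values failed validation and removed from dataframe, invalid values: {}"
new = "{:.2f}% values failed validation and removed from dataframe, invalid values: {}"

print(old.format(1.236, ["n/a"]))  # 1.2% values failed validation ...
print(new.format(1.236, ["n/a"]))  # 1.24% values failed validation ...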

upgini-1.2.114a4/src/upgini/utils/hash_utils.py
@@ -0,0 +1,137 @@
+ import os
+ import platform
+ import shutil
+ import subprocess
+ from pathlib import Path
+ from typing import Optional
+
+
+ def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+     """
+     Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+     If no suitable utility is found, gracefully falls back to hashlib.
+
+     Supported algo values (depend on OS and available utilities):
+     - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+     On Windows uses `certutil`.
+     On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+     On macOS uses `shasum -a N` or `md5` for MD5.
+     """
+     p = str(Path(path))
+
+     sysname = platform.system().lower()
+     algo = algo.lower()
+
+     # -------- command attempts depending on OS --------
+     candidates: list[list[str]] = []
+
+     if sysname == "windows":
+         # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+         name_map = {
+             "md5": "MD5",
+             "sha1": "SHA1",
+             "sha224": None,  # certutil doesn't support
+             "sha256": "SHA256",
+             "sha384": "SHA384",
+             "sha512": "SHA512",
+         }
+         cert_name = name_map.get(algo)
+         if cert_name:
+             candidates.append(["certutil", "-hashfile", p, cert_name])
+     else:
+         # Unix-like systems
+         # 1) specialized *sum utility if available (usually present on Linux)
+         sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+         if shutil.which(sum_cmd):
+             candidates.append([sum_cmd, p])
+
+         # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+         shasum_bits = {
+             "sha1": "1",
+             "sha224": "224",
+             "sha256": "256",
+             "sha384": "384",
+             "sha512": "512",
+         }
+         if algo in shasum_bits and shutil.which("shasum"):
+             candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+         # 3) for MD5 on macOS there's often a separate `md5` utility
+         if algo == "md5" and shutil.which("md5"):
+             candidates.append(["md5", p])
+
+     # -------- try system utilities --------
+     for cmd in candidates:
+         try:
+             out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+             digest = _parse_hash_output(out, cmd[0])
+             if digest:
+                 return digest.lower()
+         except (subprocess.CalledProcessError, FileNotFoundError):
+             continue  # try next candidate
+
+     # -------- reliable fallback to hashlib --------
+     import hashlib
+
+     try:
+         h = getattr(hashlib, algo)
+     except AttributeError:
+         raise ValueError(f"Algorithm not supported: {algo}")
+
+     hasher = h()
+     with open(p, "rb") as f:
+         for chunk in iter(lambda: f.read(1024 * 1024), b""):
+             hasher.update(chunk)
+     return hasher.hexdigest().lower()
+
+
+ def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+     """
+     Converts output from different utilities to clean hash.
+     Supports:
+     - sha*sum / shasum: '<hex> <filename>'
+     - certutil (Windows): line with second element as hash (spaces inside are removed)
+     - md5 (macOS): 'MD5 (file) = <hex>'
+     """
+     tool = tool.lower()
+     lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+     if not lines:
+         return None
+
+     if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+         # format: '<hex> <filename>'
+         first = lines[0]
+         parts = first.split()
+         return parts[0] if parts else None
+
+     if tool == "certutil":
+         # format:
+         # SHA256 hash of file <path>:
+         # <AA BB CC ...>
+         # CertUtil: -hashfile command completed successfully.
+         if len(lines) >= 2:
+             # Second line contains hex with spaces
+             candidate = lines[1].replace(" ", "")
+             # ensure it's hex
+             if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                 return candidate
+         return None
+
+     if tool == "md5":
+         # format: 'MD5 (<file>) = <hex>'
+         last = lines[-1]
+         if "=" in last:
+             return last.split("=", 1)[1].strip()
+         # sometimes md5 can return just the hash
+         parts = last.split()
+         if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+             return parts[-1]
+         return None
+
+     # as a last resort: take the first "looks like hash" word
+     for ln in lines:
+         for token in ln.split():
+             if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                 return token
+     return None
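
A short usage sketch for the new helper (the file path below is hypothetical); the digest obtained via a system utility and the plain-hashlib result agree, which is also what the fallback path relies on:

import hashlib

from upgini.utils.hash_utils import file_hash

path = "/tmp/x.parquet"  # any existing local file

digest = file_hash(path)                  # SHA-256 by default
md5_digest = file_hash(path, algo="md5")  # other algorithms by name

with open(path, "rb") as f:
    assert digest == hashlib.sha256(f.read()).hexdigest()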

{upgini-1.2.114a3 → upgini-1.2.114a4}/src/upgini/utils/psi.py
@@ -82,6 +82,9 @@ def calculate_features_psi(
  if not is_numeric_dtype(df[date_column]):
  df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6

+ # Filter out rows with missing dates
+ df = df[df[date_column].notna()].copy()
+
  n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()

  if TARGET in df.columns:
@@ -221,7 +224,7 @@ def _stability_agg(

  psi_value = agg_func([_psi(reference, c) for c in current])

- return psi_value
+ return float(psi_value)


  def _get_binned_data(
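
_stability_agg now normalizes its result to a built-in float instead of returning whatever scalar type the aggregation produced (often a NumPy scalar). One place where the distinction can matter is serialization; a small illustration, not taken from the library itself:

import json

import numpy as np

agg = np.float32(np.mean([0.12, 0.34]))  # e.g. a NumPy scalar coming out of an aggregation

print(type(agg).__name__, type(float(agg)).__name__)  # float32 float

json.dumps({"psi": float(agg)})  # fine
try:
    json.dumps({"psi": agg})     # most NumPy scalar types are not JSON serializable
except TypeError as err:
    print(err)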

upgini-1.2.114a3/src/upgini/__about__.py
@@ -1 +0,0 @@
- __version__ = "1.2.114a3"