upgini 1.2.114a2__tar.gz → 1.2.114a4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {upgini-1.2.114a2 → upgini-1.2.114a4}/PKG-INFO +1 -1
  2. upgini-1.2.114a4/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/features_enricher.py +83 -30
  4. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/http.py +4 -19
  5. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties +1 -1
  6. upgini-1.2.114a4/src/upgini/utils/hash_utils.py +137 -0
  7. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/psi.py +7 -4
  8. upgini-1.2.114a2/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.114a2 → upgini-1.2.114a4}/.gitignore +0 -0
  10. {upgini-1.2.114a2 → upgini-1.2.114a4}/LICENSE +0 -0
  11. {upgini-1.2.114a2 → upgini-1.2.114a4}/README.md +0 -0
  12. {upgini-1.2.114a2 → upgini-1.2.114a4}/pyproject.toml +0 -0
  13. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/all_operators.py +0 -0
  19. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/operator.py +0 -0
  24. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/__init__.py +0 -0
  25. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/base.py +0 -0
  26. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/cross.py +0 -0
  27. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/delta.py +0 -0
  28. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/lag.py +0 -0
  29. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/roll.py +0 -0
  30. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/trend.py +0 -0
  31. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/timeseries/volatility.py +0 -0
  32. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/unary.py +0 -0
  33. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/utils.py +0 -0
  34. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/autofe/vector.py +0 -0
  35. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/data_source/__init__.py +0 -0
  36. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/data_source/data_source_publisher.py +0 -0
  37. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/dataset.py +0 -0
  38. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/errors.py +0 -0
  39. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/mdc/__init__.py +0 -0
  40. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/mdc/context.py +0 -0
  41. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/metadata.py +0 -0
  42. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/metrics.py +0 -0
  43. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/normalizer/__init__.py +0 -0
  44. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/normalizer/normalize_utils.py +0 -0
  45. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/__init__.py +0 -0
  46. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/exceptions.py +0 -0
  47. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sample_utils.py +0 -0
  75. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sklearn_ext.py +0 -0
  76. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/sort.py +0 -0
  77. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/target_utils.py +0 -0
  78. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/track_info.py +0 -0
  79. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/ts_utils.py +0 -0
  80. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/warning_counter.py +0 -0
  81. {upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/version_validator.py +0 -0
{upgini-1.2.114a2 → upgini-1.2.114a4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.114a2
+ Version: 1.2.114a4
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.114a4/src/upgini/__about__.py (new file)
@@ -0,0 +1 @@
+ __version__ = "1.2.114a4"
{upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/features_enricher.py
@@ -101,6 +101,7 @@ from upgini.utils.email_utils import (
  from upgini.utils.feature_info import FeatureInfo, _round_shap_value
  from upgini.utils.features_validator import FeaturesValidator
  from upgini.utils.format import Format
+ from upgini.utils.hash_utils import file_hash
  from upgini.utils.ip_utils import IpSearchKeyConverter
  from upgini.utils.phone_utils import PhoneSearchKeyDetector
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -1516,6 +1517,8 @@ class FeaturesEnricher(TransformerMixin):
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
  )

+ self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
+
  unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
  if unstable_by_sparsity:
  self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
@@ -1524,6 +1527,8 @@ class FeaturesEnricher(TransformerMixin):
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
  )

+ self.logger.info(f"PSI values by value: {psi_values}")
+
  unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
  if unstable_by_value:
  self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
@@ -2105,7 +2110,18 @@ class FeaturesEnricher(TransformerMixin):
  columns_renaming = normalizer.columns_renaming

  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+ df = self._add_fit_system_record_id(
+ df,
+ search_keys,
+ SYSTEM_RECORD_ID,
+ TARGET,
+ columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
+ )

  # Sample after sorting by system_record_id for idempotency
  df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -2717,13 +2733,17 @@ if response.status_code == 200:

  features_not_to_pass = []
  if add_fit_system_record_id:
- df = self.__add_fit_system_record_id(
+ df = self._add_fit_system_record_id(
  df,
  search_keys,
  SYSTEM_RECORD_ID,
  TARGET,
  columns_renaming,
- silent=True,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
  features_not_to_pass.append(SORT_ID)
@@ -3263,8 +3283,17 @@ if response.status_code == 200:
  self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))

  # Explode multiple search keys
- df = self.__add_fit_system_record_id(
- df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
+ df = self._add_fit_system_record_id(
+ df,
+ self.fit_search_keys,
+ ENTITY_SYSTEM_RECORD_ID,
+ TARGET,
+ self.fit_columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )

  # TODO check that this is correct for enrichment
@@ -3298,8 +3327,17 @@ if response.status_code == 200:
  if eval_set is not None and len(eval_set) > 0:
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX

- df = self.__add_fit_system_record_id(
- df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
+ df = self._add_fit_system_record_id(
+ df,
+ self.fit_search_keys,
+ SYSTEM_RECORD_ID,
+ TARGET,
+ self.fit_columns_renaming,
+ self.id_columns,
+ self.cv,
+ self.model_task_type,
+ self.logger,
+ self.bundle,
  )

  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -4130,14 +4168,18 @@ if response.status_code == 200:
  self.logger.info(f"Finished explosion. Size after: {len(df)}")
  return df, unnest_search_keys

- def __add_fit_system_record_id(
- self,
+ @staticmethod
+ def _add_fit_system_record_id(
  df: pd.DataFrame,
  search_keys: Dict[str, SearchKey],
  id_name: str,
  target_name: str,
  columns_renaming: Dict[str, str],
- silent: bool = False,
+ id_columns: Optional[List[str]],
+ cv: Optional[CVType],
+ model_task_type: ModelTaskType,
+ logger: Optional[logging.Logger] = None,
+ bundle: ResourceBundle = bundle,
  ) -> pd.DataFrame:
  original_index_name = df.index.name
  index_name = df.index.name or DEFAULT_INDEX
@@ -4166,32 +4208,33 @@ if response.status_code == 200:
  columns_to_sort = [date_column] if date_column is not None else []

  do_sorting = True
- if self.id_columns and self.cv is not None and self.cv.is_time_series():
+ if id_columns and cv is not None and cv.is_time_series():
  # Check duplicates by date and id_columns
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
- renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
+ renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
  duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
  if date_column is not None:
  duplicate_check_columns.append(date_column)

  duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
  if duplicates.any():
- raise ValueError(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
+ raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
  else:
  columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
  columns_to_hash = sort_columns(
  df[columns_to_hash],
  target_name,
  search_keys,
- self.model_task_type,
+ model_task_type,
  sort_exclude_columns,
- logger=self.logger,
+ logger=logger,
  )
  else:
  columns_to_hash = sort_columns(
- df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
+ df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
  )
- if do_sorting:
+
+ def sort_df(df: pd.DataFrame) -> pd.DataFrame:
  search_keys_hash = "search_keys_hash"
  if len(columns_to_hash) > 0:
  factorized_df = df.copy()
@@ -4205,6 +4248,24 @@ if response.status_code == 200:

  if search_keys_hash in df.columns:
  df.drop(columns=search_keys_hash, inplace=True)
+ return df
+
+ if do_sorting:
+ sorted_dfs = []
+ if EVAL_SET_INDEX in df.columns:
+ # Sort train and eval sets separately
+ train = df[df[EVAL_SET_INDEX] == 0].copy()
+ sorted_dfs.append(sort_df(train))
+
+ for eval_set_index in df[EVAL_SET_INDEX].unique():
+ if eval_set_index == 0:
+ continue
+ eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
+ sorted_dfs.append(sort_df(eval_set_df))
+
+ df = pd.concat(sorted_dfs)
+ else:
+ df = sort_df(df)

  df = df.reset_index(drop=True).reset_index()
  # system_record_id saves correct order for fit
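The hunk above changes the sort step so that, when an eval set is present, the train part and each eval part are ordered independently before being concatenated back together. A minimal, self-contained sketch of that pattern (toy data and a toy sort key; it does not reproduce the library's column-hashing logic):

import pandas as pd

# EVAL_SET_INDEX == 0 marks train rows; other values mark eval sets.
df = pd.DataFrame({"EVAL_SET_INDEX": [0, 0, 1, 1, 2], "key": [3, 1, 9, 4, 7]})

def sort_df(part: pd.DataFrame) -> pd.DataFrame:
    # Stand-in for the real implementation, which sorts by a hash of selected columns.
    return part.sort_values(by="key")

parts = [sort_df(df[df["EVAL_SET_INDEX"] == 0].copy())]
for idx in df["EVAL_SET_INDEX"].unique():
    if idx == 0:
        continue
    parts.append(sort_df(df[df["EVAL_SET_INDEX"] == idx].copy()))

print(pd.concat(parts))  # train rows first, then each eval set, each sorted internally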
@@ -4215,11 +4276,6 @@ if response.status_code == 200:
  df.index.name = original_index_name
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)

- # meaning_types[id_name] = (
- # FileColumnMeaningType.SYSTEM_RECORD_ID
- # if id_name == SYSTEM_RECORD_ID
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
- # )
  return df

  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -4266,6 +4322,7 @@ if response.status_code == 200:
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
  raise RuntimeError(self.bundle.get("features_wasnt_returned"))

+ result_features = result_features.copy()
  if EVAL_SET_INDEX in result_features.columns:
  result_features = result_features.drop(columns=EVAL_SET_INDEX)

@@ -4993,7 +5050,7 @@ if response.status_code == 200:

  with tempfile.TemporaryDirectory() as tmp_dir:
  X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
- x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
+ x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
  if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
  self.logger.info(
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
@@ -5007,7 +5064,7 @@ if response.status_code == 200:
  if isinstance(y_, pd.Series):
  y_ = y_.to_frame()
  y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
- y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
+ y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
  if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
  self.logger.info(
  f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -5022,9 +5079,7 @@ if response.status_code == 200:
  if isinstance(eval_x_, pd.Series):
  eval_x_ = eval_x_.to_frame()
  eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
- eval_x_digest_sha256 = self.rest_client.compute_file_digest(
- f"{tmp_dir}/eval_x_{idx}.parquet"
- )
+ eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
  if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
  self.logger.info(
  f"File eval_x_{idx}.parquet was already uploaded with"
@@ -5041,9 +5096,7 @@ if response.status_code == 200:
  if isinstance(eval_y_, pd.Series):
  eval_y_ = eval_y_.to_frame()
  eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
- eval_y_digest_sha256 = self.rest_client.compute_file_digest(
- f"{tmp_dir}/eval_y_{idx}.parquet"
- )
+ eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
  if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
  self.logger.info(
  f"File eval_y_{idx}.parquet was already uploaded"
{upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/http.py
@@ -45,6 +45,7 @@ from upgini.metadata import (
  SearchCustomization,
  )
  from upgini.resource_bundle import bundle
+ from upgini.utils.hash_utils import file_hash
  from upgini.utils.track_info import get_track_metrics

  UPGINI_URL: str = "UPGINI_URL"
@@ -427,7 +428,7 @@ class _RestClient:
  api_path = self.SEARCH_DUMP_INPUT_FILE_FMT

  def upload_with_check(path: str, file_name: str):
- digest_sha256 = self.compute_file_digest(path)
+ digest_sha256 = file_hash(path)
  if self.is_file_uploaded(trace_id, digest_sha256):
  # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
  return
@@ -448,16 +449,6 @@ class _RestClient:
  if eval_y_path:
  upload_with_check(eval_y_path, "eval_y.parquet")

- @staticmethod
- def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
- hash_func = getattr(hashlib, algorithm)()
-
- with open(filepath, "rb") as f:
- for chunk in iter(lambda: f.read(chunk_size), b""):
- hash_func.update(chunk)
-
- return hash_func.hexdigest()
-
  def initial_search_v2(
  self,
  trace_id: str,
@@ -478,10 +469,7 @@ class _RestClient:
  digest = md5_hash.hexdigest()
  metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

- # digest_sha256 = hashlib.sha256(
- # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
- # ).hexdigest()
- digest_sha256 = self.compute_file_digest(file_path)
+ digest_sha256 = file_hash(file_path)
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

  with open(file_path, "rb") as file:
@@ -576,10 +564,7 @@ class _RestClient:
  digest = md5_hash.hexdigest()
  metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})

- # digest_sha256 = hashlib.sha256(
- # pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
- # ).hexdigest()
- digest_sha256 = self.compute_file_digest(file_path)
+ digest_sha256 = file_hash(file_path)
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

  with open(file_path, "rb") as file:
{upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/resource_bundle/strings.properties
@@ -244,7 +244,7 @@ validation_all_valid_status=All valid
  validation_all_valid_message= -
  validation_drop_message= Invalid rows will be dropped.
  validation_some_invalid_status=Some invalid
- validation_invalid_message={:.1f}% values failed validation and removed from dataframe, invalid values: {}
+ validation_invalid_message={:.2f}% values failed validation and removed from dataframe, invalid values: {}
  validation_all_invalid_status=All invalid
  validation_all_valid_color=#DAF7A6
  validation_some_invalid_color=#FFC300
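The only behavioral effect of this change is one extra decimal place in the reported percentage. A quick check in plain Python (made-up share of invalid values):

old = "{:.1f}% values failed validation and removed from dataframe, invalid values: {}"
new = "{:.2f}% values failed validation and removed from dataframe, invalid values: {}"
print(old.format(4.567, ["n/a"]))  # 4.6% values failed validation ...
print(new.format(4.567, ["n/a"]))  # 4.57% values failed validation ...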
upgini-1.2.114a4/src/upgini/utils/hash_utils.py (new file)
@@ -0,0 +1,137 @@
+ import os
+ import platform
+ import shutil
+ import subprocess
+ from pathlib import Path
+ from typing import Optional
+
+
+ def file_hash(path: str | os.PathLike, algo: str = "sha256") -> str:
+     """
+     Returns file hash using system utilities, working consistently on Windows/macOS/Linux.
+     If no suitable utility is found, gracefully falls back to hashlib.
+
+     Supported algo values (depend on OS and available utilities):
+     - "md5", "sha1", "sha224", "sha256", "sha384", "sha512"
+     On Windows uses `certutil`.
+     On Linux uses `sha*sum` (e.g., sha256sum) or `shasum -a N`.
+     On macOS uses `shasum -a N` or `md5` for MD5.
+     """
+     p = str(Path(path))
+
+     sysname = platform.system().lower()
+     algo = algo.lower()
+
+     # -------- command attempts depending on OS --------
+     candidates: list[list[str]] = []
+
+     if sysname == "windows":
+         # certutil supports: MD5, SHA1, SHA256, SHA384, SHA512
+         name_map = {
+             "md5": "MD5",
+             "sha1": "SHA1",
+             "sha224": None,  # certutil doesn't support
+             "sha256": "SHA256",
+             "sha384": "SHA384",
+             "sha512": "SHA512",
+         }
+         cert_name = name_map.get(algo)
+         if cert_name:
+             candidates.append(["certutil", "-hashfile", p, cert_name])
+     else:
+         # Unix-like systems
+         # 1) specialized *sum utility if available (usually present on Linux)
+         sum_cmd = f"{algo}sum"  # md5sum, sha256sum, etc.
+         if shutil.which(sum_cmd):
+             candidates.append([sum_cmd, p])
+
+         # 2) universal shasum with -a parameter (available on macOS and often on Linux)
+         shasum_bits = {
+             "sha1": "1",
+             "sha224": "224",
+             "sha256": "256",
+             "sha384": "384",
+             "sha512": "512",
+         }
+         if algo in shasum_bits and shutil.which("shasum"):
+             candidates.append(["shasum", "-a", shasum_bits[algo], p])
+
+         # 3) for MD5 on macOS there's often a separate `md5` utility
+         if algo == "md5" and shutil.which("md5"):
+             candidates.append(["md5", p])
+
+     # -------- try system utilities --------
+     for cmd in candidates:
+         try:
+             out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+             digest = _parse_hash_output(out, cmd[0])
+             if digest:
+                 return digest.lower()
+         except (subprocess.CalledProcessError, FileNotFoundError):
+             continue  # try next candidate
+
+     # -------- reliable fallback to hashlib --------
+     import hashlib
+
+     try:
+         h = getattr(hashlib, algo)
+     except AttributeError:
+         raise ValueError(f"Algorithm not supported: {algo}")
+
+     hasher = h()
+     with open(p, "rb") as f:
+         for chunk in iter(lambda: f.read(1024 * 1024), b""):
+             hasher.update(chunk)
+     return hasher.hexdigest().lower()
+
+
+ def _parse_hash_output(output: str, tool: str) -> Optional[str]:
+     """
+     Converts output from different utilities to clean hash.
+     Supports:
+     - sha*sum / shasum: '<hex> <filename>'
+     - certutil (Windows): line with second element as hash (spaces inside are removed)
+     - md5 (macOS): 'MD5 (file) = <hex>'
+     """
+     tool = tool.lower()
+     lines = [ln.strip() for ln in output.splitlines() if ln.strip()]
+
+     if not lines:
+         return None
+
+     if tool in {"sha1sum", "sha224sum", "sha256sum", "sha384sum", "sha512sum", "md5sum", "shasum"}:
+         # format: '<hex> <filename>'
+         first = lines[0]
+         parts = first.split()
+         return parts[0] if parts else None
+
+     if tool == "certutil":
+         # format:
+         # SHA256 hash of file <path>:
+         # <AA BB CC ...>
+         # CertUtil: -hashfile command completed successfully.
+         if len(lines) >= 2:
+             # Second line contains hex with spaces
+             candidate = lines[1].replace(" ", "")
+             # ensure it's hex
+             if all(c in "0123456789abcdefABCDEF" for c in candidate):
+                 return candidate
+         return None
+
+     if tool == "md5":
+         # format: 'MD5 (<file>) = <hex>'
+         last = lines[-1]
+         if "=" in last:
+             return last.split("=", 1)[1].strip()
+         # sometimes md5 can return just the hash
+         parts = last.split()
+         if parts and all(c in "0123456789abcdefABCDEF" for c in parts[-1]):
+             return parts[-1]
+         return None
+
+     # as a last resort: take the first "looks like hash" word
+     for ln in lines:
+         for token in ln.split():
+             if all(c in "0123456789abcdefABCDEF" for c in token) and len(token) >= 32:
+                 return token
+     return None
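A short usage sketch for the new helper (assumes upgini 1.2.114a4 is installed; the equality check against hashlib mirrors the fallback path described in the docstring):

import hashlib
import tempfile

from upgini.utils.hash_utils import file_hash

with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"example payload")
    path = f.name

# file_hash prefers certutil / sha256sum / shasum when present and otherwise
# falls back to hashlib; the resulting digest is the same either way.
assert file_hash(path) == hashlib.sha256(b"example payload").hexdigest()
print(file_hash(path, algo="md5"))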
{upgini-1.2.114a2 → upgini-1.2.114a4}/src/upgini/utils/psi.py
@@ -77,11 +77,14 @@ def calculate_features_psi(
  psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
  psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
  ) -> Dict[str, float]:
- empty_res = pd.Series(index=df.columns, data=0)
+ empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}

  if not is_numeric_dtype(df[date_column]):
  df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6

+ # Filter out rows with missing dates
+ df = df[df[date_column].notna()].copy()
+
  n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()

  if TARGET in df.columns:
@@ -113,9 +116,9 @@ def calculate_features_psi(
  cat_top_pct=psi_target_params.cat_top_pct,
  agg_func=target_agg_func,
  )
- if target_psi is None:
+ if target_psi is None or np.isnan(target_psi):
  logger.info("Cannot determine target PSI. Skip feature PSI check")
- return pd.Series(index=df.columns, data=0)
+ return empty_res

  if target_psi > psi_target_params.threshold:
  logger.info(
@@ -221,7 +224,7 @@ def _stability_agg(

  psi_value = agg_func([_psi(reference, c) for c in current])

- return psi_value
+ return float(psi_value)


  def _get_binned_data(
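For context on the value wrapped in float() above: _psi itself is not part of this diff, but the Population Stability Index is conventionally computed over bin shares as in this standalone illustration (not the library's implementation):

import numpy as np

def psi(expected_pct: np.ndarray, actual_pct: np.ndarray, eps: float = 1e-6) -> float:
    # PSI = sum_i (actual_i - expected_i) * ln(actual_i / expected_i) over bin shares.
    expected_pct = np.clip(expected_pct, eps, None)
    actual_pct = np.clip(actual_pct, eps, None)
    return float(np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)))

print(psi(np.array([0.25, 0.25, 0.5]), np.array([0.2, 0.3, 0.5])))  # small drift -> small PSI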
upgini-1.2.114a2/src/upgini/__about__.py (removed file)
@@ -1 +0,0 @@
- __version__ = "1.2.114a2"