upgini 1.1.244a24__py3-none-any.whl → 1.1.245a1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

upgini/dataset.py CHANGED
@@ -38,7 +38,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
-from upgini.resource_bundle import bundle
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
@@ -81,8 +81,10 @@ class Dataset: # (pd.DataFrame):
         rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
         warning_counter: Optional[WarningCounter] = None,
+        bundle: Optional[ResourceBundle] = None,
         **kwargs,
     ):
+        self.bundle = bundle or get_custom_bundle()
         if df is not None:
             data = df.copy()
         elif path is not None:
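The hunk above is the heart of this release: the message bundle becomes injectable. Dataset now accepts an optional ResourceBundle and falls back to get_custom_bundle() when none is given, and every later hunk mechanically rewires bundle.get(...) to self.bundle.get(...). A minimal sketch of what this enables; the dataset_name/df argument order and the no-argument get_custom_bundle() call are assumptions for illustration, while the bundle= parameter and bundle.get(key) are confirmed by this diff:

    import pandas as pd

    from upgini.dataset import Dataset
    from upgini.resource_bundle import get_custom_bundle

    df = pd.DataFrame({"phone": ["+15551234567"], "target": [0]})

    # Default: no bundle passed, the constructor falls back to get_custom_bundle().
    dataset = Dataset("example", df=df)  # argument order is an assumption

    # Injected: pass a pre-built bundle so every validation message
    # (e.g. "dataset_too_few_rows") is resolved through it.
    custom_bundle = get_custom_bundle()  # exact signature is not shown in this diff
    dataset = Dataset("example", df=df, bundle=custom_bundle)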
@@ -95,13 +97,13 @@ class Dataset: # (pd.DataFrame):
                 kwargs["sep"] = sep
             data = pd.read_csv(path, **kwargs)
         else:
-            raise ValueError(bundle.get("dataset_dataframe_or_path_empty"))
+            raise ValueError(self.bundle.get("dataset_dataframe_or_path_empty"))
         if isinstance(data, pd.DataFrame):
             self.data = data
         elif isinstance(data, pd.io.parsers.TextFileReader):  # type: ignore
-            raise ValueError(bundle.get("dataset_dataframe_iterator"))
+            raise ValueError(self.bundle.get("dataset_dataframe_iterator"))
         else:
-            raise ValueError(bundle.get("dataset_dataframe_not_pandas"))
+            raise ValueError(self.bundle.get("dataset_dataframe_not_pandas"))

         self.dataset_name = dataset_name
         self.task_type = model_task_type
@@ -134,14 +136,14 @@ class Dataset: # (pd.DataFrame):
     @property
     def meaning_types_checked(self) -> Dict[str, FileColumnMeaningType]:
         if self.meaning_types is None:
-            raise ValueError(bundle.get("dataset_empty_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_empty_meaning_types"))
         else:
             return self.meaning_types

     @property
     def search_keys_checked(self) -> List[Tuple[str, ...]]:
         if self.search_keys is None:
-            raise ValueError(bundle.get("dataset_empty_search_keys"))
+            raise ValueError(self.bundle.get("dataset_empty_search_keys"))
         else:
             return self.search_keys

@@ -156,11 +158,11 @@ class Dataset: # (pd.DataFrame):

     def __validate_min_rows_count(self):
         if len(self.data) < self.MIN_ROWS_COUNT:
-            raise ValidationError(bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
+            raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))

     def __validate_max_row_count(self):
         if len(self.data) > self.MAX_ROWS:
-            raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
+            raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))

     def __rename_columns(self):
         # self.logger.info("Replace restricted symbols in column names")
@@ -175,7 +177,7 @@ class Dataset: # (pd.DataFrame):
             new_column = str(column)
             suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
             if len(new_column) == 0:
-                raise ValidationError(bundle.get("dataset_empty_column_names"))
+                raise ValidationError(self.bundle.get("dataset_empty_column_names"))
             # db limit for column length
             if len(new_column) > 250:
                 new_column = new_column[:250]
@@ -235,7 +237,7 @@ class Dataset: # (pd.DataFrame):
         nrows_after_full_dedup = len(self.data)
         share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
         if share_full_dedup > 0:
-            msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+            msg = self.bundle.get("dataset_full_duplicates").format(share_full_dedup)
             self.logger.warning(msg)
             # if not silent_mode:
             #     print(msg)
@@ -250,7 +252,9 @@ class Dataset: # (pd.DataFrame):
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup

-            msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
+            msg = self.bundle.get("dataset_diff_target_duplicates").format(
+                share_tgt_dedup, num_dup_rows, dups_indices
+            )
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -342,7 +346,7 @@ class Dataset: # (pd.DataFrame):

         self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
         if self.data[ip].isnull().all():
-            raise ValidationError(bundle.get("invalid_ip").format(ip))
+            raise ValidationError(self.bundle.get("invalid_ip").format(ip))

         if self.data[ip].apply(self._is_ipv4).any():
             ipv4 = ip + "_v4"
@@ -379,7 +383,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace("UK", "GB", regex=False)
             )
             if (self.data[iso_code] == "").all():
-                raise ValidationError(bundle.get("invalid_country").format(iso_code))
+                raise ValidationError(self.bundle.get("invalid_country").format(iso_code))

     def __normalize_postal_code(self):
         postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
@@ -402,7 +406,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace(r"^0+\B", "", regex=True)  # remove leading zeros
             )
             if (self.data[postal_code] == "").all():
-                raise ValidationError(bundle.get("invalid_postal_code").format(postal_code))
+                raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))

     def __normalize_hem(self):
         hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
@@ -420,9 +424,9 @@ class Dataset: # (pd.DataFrame):
             self.data.drop(index=old_subset.index, inplace=True)  # type: ignore
             self.logger.info(f"df after dropping old rows: {self.data.shape}")
             if len(self.data) == 0:
-                raise ValidationError(bundle.get("dataset_all_dates_old"))
+                raise ValidationError(self.bundle.get("dataset_all_dates_old"))
             else:
-                msg = bundle.get("dataset_drop_old_dates")
+                msg = self.bundle.get("dataset_drop_old_dates")
                 self.logger.warning(msg)
                 if not silent_mode:
                     print(msg)
@@ -458,10 +462,10 @@ class Dataset: # (pd.DataFrame):
                 target = target.astype("category").cat.codes
             except ValueError:
                 self.logger.exception("Failed to cast target to category codes for binary task type")
-                raise ValidationError(bundle.get("dataset_invalid_target_type").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_target_type").format(target.dtype))
             target_classes_count = target.nunique()
             if target_classes_count != 2:
-                msg = bundle.get("dataset_invalid_binary_target").format(target_classes_count)
+                msg = self.bundle.get("dataset_invalid_binary_target").format(target_classes_count)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
         elif self.task_type == ModelTaskType.MULTICLASS:
@@ -470,21 +474,21 @@ class Dataset: # (pd.DataFrame):
                 target = self.data[target_column].astype("category").cat.codes
             except Exception:
                 self.logger.exception("Failed to cast target to category codes for multiclass task type")
-                raise ValidationError(bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
         elif self.task_type == ModelTaskType.REGRESSION:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for regression task type")
-                    raise ValidationError(bundle.get("dataset_invalid_regression_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
         elif self.task_type == ModelTaskType.TIMESERIES:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for timeseries task type")
-                    raise ValidationError(bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))

     def __resample(self):
         # self.logger.info("Resampling etalon")
@@ -505,7 +509,7 @@ class Dataset: # (pd.DataFrame):
             target_classes_count = target.nunique()

             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-                msg = bundle.get("dataset_to_many_multiclass_targets").format(
+                msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
                     target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
                 )
                 self.logger.warning(msg)
@@ -519,7 +523,7 @@ class Dataset: # (pd.DataFrame):
                     min_class_value = v

             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-                msg = bundle.get("dataset_rarest_class_less_min").format(
+                msg = self.bundle.get("dataset_rarest_class_less_min").format(
                     min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
                 )
                 self.logger.warning(msg)
@@ -529,7 +533,7 @@ class Dataset: # (pd.DataFrame):
             min_class_threshold = min_class_percent * count

             if min_class_count < min_class_threshold:
-                msg = bundle.get("dataset_rarest_class_less_threshold").format(
+                msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
                     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
                 )
                 self.logger.warning(msg)
@@ -543,7 +547,7 @@ class Dataset: # (pd.DataFrame):
                 quantile25_idx = int(0.75 * len(classes))
                 quantile25_class = classes[quantile25_idx]
                 count_of_quantile25_class = len(target[target == quantile25_class])
-                msg = bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+                msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
                 self.logger.warning(msg)
                 print(msg)
                 # 25% and lower classes will stay as is. Higher classes will be downsampled
@@ -621,7 +625,7 @@ class Dataset: # (pd.DataFrame):
                 del self.meaning_types_checked[f]

         if removed_features:
-            msg = bundle.get("dataset_date_features").format(removed_features)
+            msg = self.bundle.get("dataset_date_features").format(removed_features)
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -629,7 +633,7 @@ class Dataset: # (pd.DataFrame):

     def __validate_features_count(self):
         if len(self.__features()) > self.MAX_FEATURES_COUNT:
-            msg = bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
+            msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
             self.logger.warning(msg)
             raise ValidationError(msg)

@@ -646,14 +650,14 @@ class Dataset: # (pd.DataFrame):
         target = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value)
         if validate_target:
             if target is None:
-                raise ValidationError(bundle.get("dataset_missing_target"))
+                raise ValidationError(self.bundle.get("dataset_missing_target"))

             target_value = self.__target_value()
             target_items = target_value.nunique()
             if target_items == 1:
-                raise ValidationError(bundle.get("dataset_constant_target"))
+                raise ValidationError(self.bundle.get("dataset_constant_target"))
             elif target_items == 0:
-                raise ValidationError(bundle.get("dataset_empty_target"))
+                raise ValidationError(self.bundle.get("dataset_empty_target"))

             # if self.task_type != ModelTaskType.MULTICLASS:
             #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
@@ -664,23 +668,29 @@ class Dataset: # (pd.DataFrame):
             for key in search_group
             if self.columns_renaming.get(key) != EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME
         ]
+        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS)
+        if (
+            FileColumnMeaningType.IPV6_ADDRESS in self.etalon_def_checked
+            and ipv4_column is not None
+            and ipv4_column in keys_to_validate
+        ):
+            keys_to_validate.remove(ipv4_column)
+
         mandatory_columns = [target]
         columns_to_validate = mandatory_columns.copy()
         columns_to_validate.extend(keys_to_validate)
         columns_to_validate = set([i for i in columns_to_validate if i is not None])

-        # TODO remove ipv4 from validation if ipv6 is presented
-
         nrows = len(self.data)
         validation_stats = {}
         self.data["valid_keys"] = 0
         self.data["valid_mandatory"] = True

-        all_valid_status = bundle.get("validation_all_valid_status")
-        some_invalid_status = bundle.get("validation_some_invalid_status")
-        all_invalid_status = bundle.get("validation_all_invalid_status")
-        all_valid_message = bundle.get("validation_all_valid_message")
-        invalid_message = bundle.get("validation_invalid_message")
+        all_valid_status = self.bundle.get("validation_all_valid_status")
+        some_invalid_status = self.bundle.get("validation_some_invalid_status")
+        all_invalid_status = self.bundle.get("validation_all_invalid_status")
+        all_valid_message = self.bundle.get("validation_all_valid_message")
+        invalid_message = self.bundle.get("validation_invalid_message")

         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
@@ -721,9 +731,9 @@ class Dataset: # (pd.DataFrame):
         if not silent_mode:
             df_stats = pd.DataFrame.from_dict(validation_stats, orient="index")
             df_stats.reset_index(inplace=True)
-            name_header = bundle.get("validation_column_name_header")
-            status_header = bundle.get("validation_status_header")
-            description_header = bundle.get("validation_descr_header")
+            name_header = self.bundle.get("validation_column_name_header")
+            status_header = self.bundle.get("validation_status_header")
+            description_header = self.bundle.get("validation_descr_header")
             df_stats.columns = [name_header, status_header, description_header]
             try:
                 import html
@@ -732,11 +742,11 @@ class Dataset: # (pd.DataFrame):

                 _ = get_ipython()  # type: ignore

-                text_color = bundle.get("validation_text_color")
+                text_color = self.bundle.get("validation_text_color")
                 colormap = {
-                    all_valid_status: bundle.get("validation_all_valid_color"),
-                    some_invalid_status: bundle.get("validation_some_invalid_color"),
-                    all_invalid_status: bundle.get("validation_all_invalid_color"),
+                    all_valid_status: self.bundle.get("validation_all_valid_color"),
+                    some_invalid_status: self.bundle.get("validation_some_invalid_color"),
+                    all_invalid_status: self.bundle.get("validation_all_invalid_color"),
                 }

                 def map_color(text) -> str:
@@ -760,31 +770,33 @@ class Dataset: # (pd.DataFrame):
                 print(df_stats)

         if len(self.data) == 0:
-            raise ValidationError(bundle.get("all_search_keys_invalid"))
+            raise ValidationError(self.bundle.get("all_search_keys_invalid"))

     def __validate_meaning_types(self, validate_target: bool):
         # self.logger.info("Validating meaning types")
         if self.meaning_types is None or len(self.meaning_types) == 0:
-            raise ValueError(bundle.get("dataset_missing_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_missing_meaning_types"))

         if SYSTEM_RECORD_ID not in self.data.columns:
             raise ValueError("Internal error")

         for column in self.meaning_types:
             if column not in self.data.columns:
-                raise ValueError(bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
+                raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
         if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
-            raise ValueError(bundle.get("dataset_missing_target"))
+            raise ValueError(self.bundle.get("dataset_missing_target"))

     def __validate_search_keys(self):
         # self.logger.info("Validating search keys")
         if self.search_keys is None or len(self.search_keys) == 0:
-            raise ValueError(bundle.get("dataset_missing_search_keys"))
+            raise ValueError(self.bundle.get("dataset_missing_search_keys"))
         for keys_group in self.search_keys:
             for key in keys_group:
                 if key not in self.data.columns:
                     showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
-                    raise ValidationError(bundle.get("dataset_missing_search_key_column").format(key, showing_columns))
+                    raise ValidationError(
+                        self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
+                    )

     def validate(self, validate_target: bool = True, silent_mode: bool = False):
         # self.logger.info("Validating dataset")
@@ -889,7 +901,7 @@ class Dataset: # (pd.DataFrame):
         elif is_string_dtype(pandas_data_type):
             return DataType.STRING
         else:
-            msg = bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
+            msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
             self.logger.warning(msg)
             raise ValidationError(msg)

@@ -920,7 +932,7 @@ class Dataset: # (pd.DataFrame):
                 for key in filter_features
                 if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
             ]:
                raise ValidationError(self.bundle.get("dataset_invalid_filter"))
             feature_filter = FeaturesFilter(
                 minImportance=filter_features.get("min_importance"),
                 maxPSI=filter_features.get("max_psi"),
@@ -1011,7 +1023,7 @@ class Dataset: # (pd.DataFrame):
             trace_id, parquet_file_path, file_metadata, file_metrics, search_customization
         )
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.MATCHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.MATCHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.MATCHING))
         self.file_upload_id = search_task_response.file_upload_id
@@ -1082,7 +1094,7 @@ class Dataset: # (pd.DataFrame):
         )
         self.file_upload_id = search_task_response.file_upload_id
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.ENRICHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.ENRICHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.ENRICHING))

@@ -1102,5 +1114,5 @@ class Dataset: # (pd.DataFrame):
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
-            raise ValidationError(bundle.get("dataset_too_big_file"))
+            raise ValidationError(self.bundle.get("dataset_too_big_file"))
         return parquet_file_path