upgini 1.1.253a5__tar.gz → 1.1.253a3261__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic.

Files changed (85)
  1. {upgini-1.1.253a5/src/upgini.egg-info → upgini-1.1.253a3261}/PKG-INFO +3 -2
  2. {upgini-1.1.253a5 → upgini-1.1.253a3261}/setup.py +3 -2
  3. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/data_source/data_source_publisher.py +1 -1
  4. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/dataset.py +57 -20
  5. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/features_enricher.py +15 -15
  6. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/search_task.py +1 -1
  7. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/datetime_utils.py +1 -1
  8. upgini-1.1.253a3261/src/upgini/utils/target_utils.py +74 -0
  9. {upgini-1.1.253a5 → upgini-1.1.253a3261/src/upgini.egg-info}/PKG-INFO +3 -2
  10. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_etalon_validation.py +3 -5
  11. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_features_enricher.py +5 -2
  12. upgini-1.1.253a3261/tests/test_target_utils.py +74 -0
  13. upgini-1.1.253a5/src/upgini/utils/target_utils.py +0 -183
  14. upgini-1.1.253a5/tests/test_target_utils.py +0 -134
  15. {upgini-1.1.253a5 → upgini-1.1.253a3261}/LICENSE +0 -0
  16. {upgini-1.1.253a5 → upgini-1.1.253a3261}/README.md +0 -0
  17. {upgini-1.1.253a5 → upgini-1.1.253a3261}/pyproject.toml +0 -0
  18. {upgini-1.1.253a5 → upgini-1.1.253a3261}/setup.cfg +0 -0
  19. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/__init__.py +0 -0
  20. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/ads.py +0 -0
  21. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/ads_management/__init__.py +0 -0
  22. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/ads_management/ads_manager.py +0 -0
  23. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/__init__.py +0 -0
  24. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/all_operands.py +0 -0
  25. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/binary.py +0 -0
  26. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/feature.py +0 -0
  27. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/groupby.py +0 -0
  28. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/operand.py +0 -0
  29. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/autofe/vector.py +0 -0
  31. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/data_source/__init__.py +0 -0
  32. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/errors.py +0 -0
  33. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/fingerprint.js +0 -0
  34. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/http.py +0 -0
  35. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/metadata.py +0 -0
  38. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/metrics.py +0 -0
  39. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/normalizer/__init__.py +0 -0
  40. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/normalizer/phone_normalizer.py +0 -0
  41. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/resource_bundle/__init__.py +0 -0
  42. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/resource_bundle/exceptions.py +0 -0
  43. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/resource_bundle/strings.properties +0 -0
  44. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  45. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/sampler/__init__.py +0 -0
  46. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/sampler/base.py +0 -0
  47. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/sampler/random_under_sampler.py +0 -0
  48. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/sampler/utils.py +0 -0
  49. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/spinner.py +0 -0
  50. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/__init__.py +0 -0
  51. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/base_search_key_detector.py +0 -0
  52. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/blocked_time_series.py +0 -0
  53. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/country_utils.py +0 -0
  54. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/custom_loss_utils.py +0 -0
  55. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/cv_utils.py +0 -0
  56. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/deduplicate_utils.py +0 -0
  57. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/display_utils.py +0 -0
  58. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/email_utils.py +0 -0
  59. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/fallback_progress_bar.py +0 -0
  60. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/features_validator.py +0 -0
  61. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/format.py +0 -0
  62. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/ip_utils.py +0 -0
  63. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/phone_utils.py +0 -0
  64. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/postal_code_utils.py +0 -0
  65. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/progress_bar.py +0 -0
  66. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/sklearn_ext.py +0 -0
  67. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini/version_validator.py +0 -0
  70. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini.egg-info/SOURCES.txt +0 -0
  71. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini.egg-info/dependency_links.txt +0 -0
  72. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini.egg-info/requires.txt +0 -0
  73. {upgini-1.1.253a5 → upgini-1.1.253a3261}/src/upgini.egg-info/top_level.txt +0 -0
  74. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_binary_dataset.py +0 -0
  75. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_blocked_time_series.py +0 -0
  76. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_categorical_dataset.py +0 -0
  77. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_continuous_dataset.py +0 -0
  78. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_country_utils.py +0 -0
  79. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_custom_loss_utils.py +0 -0
  80. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_datetime_utils.py +0 -0
  81. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_email_utils.py +0 -0
  82. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_metrics.py +0 -0
  83. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_phone_utils.py +0 -0
  84. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_postal_code_utils.py +0 -0
  85. {upgini-1.1.253a5 → upgini-1.1.253a3261}/tests/test_widget.py +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.253a5
+ Version: 1.1.253a3261
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
@@ -18,12 +18,13 @@ Classifier: Intended Audience :: Science/Research
  Classifier: Intended Audience :: Telecommunications Industry
  Classifier: License :: OSI Approved :: BSD License
  Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.7
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
- Requires-Python: >=3.8,<3.11
+ Requires-Python: >=3.7,<3.11
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: python-dateutil>=2.8.0

setup.py

@@ -40,7 +40,7 @@ def send_log(msg: str):


  here = Path(__file__).parent.resolve()
- version = "1.1.253a5"
+ version = "1.1.253a3261"
  try:
      send_log(f"Start setup PyLib version {version}")
      setup(
@@ -62,6 +62,7 @@ try:
              "Intended Audience :: Telecommunications Industry",
              "License :: OSI Approved :: BSD License",
              "Operating System :: OS Independent",
+             "Programming Language :: Python :: 3.7",
              "Programming Language :: Python :: 3.8",
              "Programming Language :: Python :: 3.9",
              "Programming Language :: Python :: 3.10",
@@ -73,7 +74,7 @@ try:
          package_dir={"": "src"},
          packages=find_packages(where="src"),
          package_data={"": ["strings.properties", "strings_widget.properties", "fingerprint.js"]},
-         python_requires=">=3.8,<3.11",
+         python_requires=">=3.7,<3.11",
          install_requires=[
              "python-dateutil>=2.8.0",
              "requests>=2.8.0",

src/upgini/data_source/data_source_publisher.py

@@ -40,7 +40,7 @@ class DataSourcePublisher:
          if logs_enabled:
              self.logger = LoggerFactory().get_logger(endpoint, api_key)
          else:
-             self.logger = logging.getLogger("muted_logger")
+             self.logger = logging.getLogger()
          self.logger.setLevel("FATAL")

      def place(

src/upgini/dataset.py

@@ -39,10 +39,10 @@ from upgini.metadata import (
  )
  from upgini.normalizer.phone_normalizer import PhoneNormalizer
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+ from upgini.sampler.random_under_sampler import RandomUnderSampler
  from upgini.search_task import SearchTask
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
  from upgini.utils.email_utils import EmailSearchKeyConverter
- from upgini.utils.target_utils import balance_undersample

  try:
      from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -61,8 +61,6 @@ class Dataset: # (pd.DataFrame):
      FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
      MIN_SAMPLE_THRESHOLD = 5_000
      IMBALANCE_THESHOLD = 0.4
-     BINARY_BOOTSTRAP_LOOPS = 5
-     MULTICLASS_BOOTSTRAP_LOOPS = 2
      MIN_TARGET_CLASS_ROWS = 100
      MAX_MULTICLASS_CLASS_COUNT = 100
      MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -463,8 +461,10 @@ class Dataset: # (pd.DataFrame):
              self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
          ):
              count = len(train_segment)
-             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-             target = train_segment[target_column]
+             min_class_count = count
+             min_class_value = None
+             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
+             target = train_segment[target_column].copy()
              target_classes_count = target.nunique()

              if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -474,9 +474,12 @@
                  self.logger.warning(msg)
                  raise ValidationError(msg)

-             vc = target.value_counts()
-             min_class_value = vc.index[len(vc) - 1]
-             min_class_count = vc[min_class_value]
+             unique_target = target.unique()
+             for v in list(unique_target):  # type: ignore
+                 current_class_count = len(train_segment.loc[target == v])
+                 if current_class_count < min_class_count:
+                     min_class_count = current_class_count
+                     min_class_value = v

              if min_class_count < self.MIN_TARGET_CLASS_ROWS:
                  msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -489,19 +492,53 @@
              min_class_threshold = min_class_percent * count

              if min_class_count < min_class_threshold:
-                 self.imbalanced = True
-                 self.data = balance_undersample(
-                     df=train_segment,
-                     target_column=target_column,
-                     task_type=self.task_type,
-                     random_state=self.random_state,
-                     imbalance_threshold=self.IMBALANCE_THESHOLD,
-                     binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
-                     multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
-                     logger=self.logger,
-                     bundle=self.bundle,
-                     warning_counter=self.warning_counter,
+                 msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
+                     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
                  )
+                 self.logger.warning(msg)
+                 print(msg)
+                 self.warning_counter.increment()
+
+                 train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
+                 if self.task_type == ModelTaskType.MULTICLASS:
+                     # Sort classes by rows count and find 25% quantile class
+                     classes = target.value_counts().index
+                     quantile25_idx = int(0.75 * len(classes))
+                     quantile25_class = classes[quantile25_idx]
+                     count_of_quantile25_class = len(target[target == quantile25_class])
+                     msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+                     self.logger.warning(msg)
+                     print(msg)
+                     # 25% and lower classes will stay as is. Higher classes will be downsampled
+                     parts = []
+                     for class_idx in range(quantile25_idx):
+                         sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
+                             n=count_of_quantile25_class, random_state=self.random_state
+                         )
+                         parts.append(sampled)
+                     for class_idx in range(quantile25_idx, len(classes)):
+                         parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
+                     resampled_data = pd.concat(parts)
+                 elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
+                     minority_class = train_segment[train_segment[target_column] == min_class_value]
+                     majority_class = train_segment[train_segment[target_column] != min_class_value]
+                     sampled_majority_class = majority_class.sample(
+                         n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
+                     )
+                     resampled_data = train_segment[
+                         (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
+                         | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
+                     ]
+                 else:
+                     sampler = RandomUnderSampler(random_state=self.random_state)
+                     X = train_segment[SYSTEM_RECORD_ID]
+                     X = X.to_frame(SYSTEM_RECORD_ID)
+                     new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+                     resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+
+                 self.data = resampled_data
+                 self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
+                 self.imbalanced = True

          # Resample over fit threshold
          if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
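
The dataset.py hunk above replaces the call to the removed balance_undersample helper with in-place rebalancing. As a reading aid, here is a minimal, standalone sketch of just the multiclass branch (hypothetical function name, no SYSTEM_RECORD_ID bookkeeping or resource-bundle warnings, not the package's API): classes at or below the 25%-quantile row count are kept as is, larger classes are downsampled to that count.

import pandas as pd


def downsample_to_quantile25(df: pd.DataFrame, target_column: str, random_state: int = 42) -> pd.DataFrame:
    # value_counts() orders classes from most to least frequent
    counts = df[target_column].value_counts()
    classes = counts.index
    # Class sitting at the 25% quantile by row count (same index arithmetic as the hunk above)
    quantile25_idx = int(0.75 * len(classes))
    quantile25_count = int(counts[classes[quantile25_idx]])

    parts = []
    for class_idx, class_value in enumerate(classes):
        class_rows = df[df[target_column] == class_value]
        if class_idx < quantile25_idx:
            # Classes more frequent than the 25%-quantile class are downsampled to its size
            class_rows = class_rows.sample(n=quantile25_count, random_state=random_state)
        parts.append(class_rows)
    return pd.concat(parts)


With classes of 100/400/500/800 rows this keeps 100 rows per class (400 total), which matches the updated assertions in tests/test_etalon_validation.py further down.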

src/upgini/features_enricher.py

@@ -221,7 +221,7 @@ class FeaturesEnricher(TransformerMixin):
          if logs_enabled:
              self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
          else:
-             self.logger = logging.getLogger("muted_logger")
+             self.logger = logging.getLogger()
          self.logger.setLevel("FATAL")

          if len(kwargs) > 0:
@@ -3047,6 +3047,20 @@
          def list_or_single(lst: List[str], single: str):
              return lst or ([single] if single else [])

+         def to_anchor(link: str, value: str) -> str:
+             if not value:
+                 return ""
+             elif not link:
+                 return value
+             elif value == llm_source:
+                 return value
+             else:
+                 return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
+
+         def make_links(names: List[str], links: List[str]):
+             all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
+             return ",".join(all_links)
+
          features_meta.sort(key=lambda m: (-m.shap_value, m.name))
          for feature_meta in features_meta:
              if feature_meta.name in original_names_dict.keys():
@@ -3072,20 +3086,6 @@
                  if len(feature_sample) > 30:
                      feature_sample = feature_sample[:30] + "..."

-                 def to_anchor(link: str, value: str) -> str:
-                     if not value:
-                         return ""
-                     elif not link:
-                         return value
-                     elif value == llm_source:
-                         return value
-                     else:
-                         return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
-
-                 def make_links(names: List[str], links: List[str]):
-                     all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
-                     return ",".join(all_links)
-
              internal_provider = feature_meta.data_provider or "Upgini"
              providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
              provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
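
The to_anchor and make_links helpers that were hoisted in the hunk above render the provider and source columns of the feature report as HTML links. A standalone sketch of the same pair for illustration (llm_source is stubbed here; in features_enricher.py it comes from the enclosing scope):

import itertools

llm_source = "LLM"  # placeholder sentinel; the real value is defined elsewhere in features_enricher.py


def to_anchor(link: str, value: str) -> str:
    if not value:
        return ""
    elif not link:
        return value
    elif value == llm_source:
        return value
    else:
        return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"


def make_links(names, links):
    # zip_longest pads the shorter list with None, so a name without a link is kept as plain text
    return ",".join(to_anchor(link, name) for name, link in itertools.zip_longest(names, links))


print(make_links(["Upgini", "Other provider"], ["https://upgini.com/"]))
# <a href='https://upgini.com/' target='_blank' rel='noopener noreferrer'>Upgini</a>,Other provider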

src/upgini/search_task.py

@@ -57,7 +57,7 @@ class SearchTask:
          if logger is not None:
              self.logger = logger
          else:
-             self.logger = logging.getLogger("muted_logger")
+             self.logger = logging.getLogger()
          self.logger.setLevel("FATAL")
          self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
          self.unused_features_for_generation: Optional[List[str]] = None

src/upgini/utils/datetime_utils.py

@@ -31,7 +31,7 @@ class DateTimeSearchKeyConverter:
          if logger is not None:
              self.logger = logger
          else:
-             self.logger = logging.getLogger("muted_logger")
+             self.logger = logging.getLogger()
          self.logger.setLevel("FATAL")
          self.generated_features: List[str] = []
          self.bundle = bundle or get_custom_bundle()

upgini-1.1.253a3261/src/upgini/utils/target_utils.py (new file)

@@ -0,0 +1,74 @@
+ import logging
+ from typing import Optional, Union
+
+ import numpy as np
+ import pandas as pd
+ from pandas.api.types import is_numeric_dtype
+
+ from upgini.errors import ValidationError
+ from upgini.metadata import ModelTaskType
+ from upgini.resource_bundle import bundle
+
+
+ def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
+     if isinstance(y, pd.Series):
+         return y.astype(str).astype("category").cat.codes
+     elif isinstance(y, np.ndarray):
+         return pd.Series(y).astype(str).astype("category").cat.codes.values
+
+
+ def define_task(
+     y: pd.Series, has_date: bool = False, logger: Optional[logging.Logger] = None, silent: bool = False
+ ) -> ModelTaskType:
+     if logger is None:
+         logger = logging.getLogger()
+     target = y.dropna()
+     if is_numeric_dtype(target):
+         target = target.loc[np.isfinite(target)]
+     else:
+         target = target.loc[target != ""]
+     if len(target) == 0:
+         raise ValidationError(bundle.get("empty_target"))
+     target_items = target.nunique()
+     if target_items == 1:
+         raise ValidationError(bundle.get("dataset_constant_target"))
+     if target_items == 2:
+         task = ModelTaskType.BINARY
+     else:
+         try:
+             target = pd.to_numeric(target)
+             is_numeric = True
+         except Exception:
+             is_numeric = False
+
+         # If any value is non numeric - multiclass
+         if not is_numeric:
+             task = ModelTaskType.MULTICLASS
+         else:
+             if target.nunique() <= 50 and is_int_encoding(target.unique()):
+                 task = ModelTaskType.MULTICLASS
+             elif has_date:
+                 task = ModelTaskType.REGRESSION
+             else:
+                 non_zero_target = target[target != 0]
+                 target_items = non_zero_target.nunique()
+                 target_ratio = target_items / len(non_zero_target)
+                 if (
+                     (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
+                     or target_items > 50
+                     or target_ratio > 0.2
+                 ):
+                     task = ModelTaskType.REGRESSION
+                 else:
+                     task = ModelTaskType.MULTICLASS
+
+     logger.info(f"Detected task type: {task}")
+     if not silent:
+         print(bundle.get("target_type_detected").format(task))
+     return task
+
+
+ def is_int_encoding(unique_values):
+     return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
+         range(1, len(unique_values) + 1)
+     )
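
A short usage sketch for the rewritten target_utils module above (assumes this version of upgini is installed; the Series values are made up for illustration):

import pandas as pd

from upgini.metadata import ModelTaskType
from upgini.utils.target_utils import define_task

# Two distinct values -> binary classification
assert define_task(pd.Series([0, 1, 0, 1, 0, 1]), silent=True) == ModelTaskType.BINARY

# Non-integer float target with many distinct values relative to its length -> regression
assert define_task(pd.Series([0.0, 1.5, 2.7, 3.1, 4.8]), silent=True) == ModelTaskType.REGRESSION

# Small label set encoded as 0..n-1 -> multiclass
assert define_task(pd.Series([0, 1, 2, 3, 2, 1, 0]), silent=True) == ModelTaskType.MULTICLASS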

src/upgini.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.253a5
+ Version: 1.1.253a3261
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
@@ -18,12 +18,13 @@ Classifier: Intended Audience :: Science/Research
  Classifier: Intended Audience :: Telecommunications Industry
  Classifier: License :: OSI Approved :: BSD License
  Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.7
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
- Requires-Python: >=3.8,<3.11
+ Requires-Python: >=3.7,<3.11
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: python-dateutil>=2.8.0

tests/test_etalon_validation.py

@@ -244,13 +244,11 @@ def test_imbalanced_target():
      }
      dataset.task_type = ModelTaskType.MULTICLASS
      dataset._Dataset__resample()
-     assert len(dataset) == 1800
+     assert len(dataset) == 400
      value_counts = dataset.data["target"].value_counts()
      assert len(value_counts) == 4
-     assert value_counts["a"] == 100
-     assert value_counts["b"] == 400
-     assert value_counts["c"] == 500
-     assert value_counts["d"] == 800
+     for label in dataset.data["target"].unique():
+         assert value_counts[label] == 100


  def test_fail_on_small_class_observations():

tests/test_features_enricher.py

@@ -2136,6 +2136,8 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
      expected_result_df = (
          pd.read_parquet(expected_result_path).sort_values(by="system_record_id").reset_index(drop=True)
      )
+     expected_result_df["phone_num_a54a33"] = expected_result_df["phone_num_a54a33"].astype("Int64")
+     expected_result_df["rep_date_f5d6bb"] = expected_result_df["rep_date_f5d6bb"].astype("Int64")

      def test(n_shuffles: int):
          train_df = initial_train_df.copy()
@@ -2161,7 +2163,6 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
              pass

          actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
-         # actual_result_df.to_parquet(expected_result_path)
          assert_frame_equal(actual_result_df, expected_result_df)

      for i in range(5):
@@ -2603,7 +2604,9 @@ def test_unsupported_arguments(requests_mock: Mocker):

      enricher.transform(df.drop(columns="target"), "unsupported_positional_argument", unsupported_key_argument=False)

-     with pytest.raises(ValueError, match="Only one class present in y_true. ROC AUC score is not defined in that case."):
+     with pytest.raises(
+         ValueError, match="Only one class present in y_true. ROC AUC score is not defined in that case."
+     ):
          enricher.calculate_metrics(
              df.drop(columns="target"),
              df["target"],

upgini-1.1.253a3261/tests/test_target_utils.py (new file)

@@ -0,0 +1,74 @@
+ import numpy as np
+ import pandas as pd
+ import pytest
+
+ from upgini.errors import ValidationError
+ from upgini.metadata import ModelTaskType
+ from upgini.resource_bundle import bundle
+ from upgini.utils.target_utils import define_task
+
+
+ def test_invalid_target():
+     y = pd.Series(["", "", ""])
+     with pytest.raises(ValidationError, match=bundle.get("empty_target")):
+         define_task(y)
+
+     y = pd.Series([np.nan, np.inf, -np.inf])
+     with pytest.raises(ValidationError, match=bundle.get("empty_target")):
+         define_task(y)
+
+     y = pd.Series([1, 1, 1, 1, 1])
+     with pytest.raises(ValidationError, match=bundle.get("dataset_constant_target")):
+         define_task(y)
+
+
+ def test_define_binary_task_type():
+     y = pd.Series([0, 1, 0, 1, 0, 1])
+     assert define_task(y, False) == ModelTaskType.BINARY
+     assert define_task(y, True) == ModelTaskType.BINARY
+
+     y = pd.Series(["a", "b", "a", "b", "a"])
+     assert define_task(y, False) == ModelTaskType.BINARY
+     assert define_task(y, True) == ModelTaskType.BINARY
+
+
+ def test_define_multiclass_task_type():
+     y = pd.Series(range(1, 51))
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+     assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+     y = pd.Series([float(x) for x in range(1, 51)])
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+     assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+     y = pd.Series(range(0, 50))
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+     assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+     y = pd.Series(["a", "b", "c", "b", "a"])
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+     assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+     y = pd.Series(["0", "1", "2", "3", "a"])
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+     assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
+     assert define_task(y, False) == ModelTaskType.MULTICLASS
+
+
+ def test_define_regression_task_type():
+     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
+     assert define_task(y, True) == ModelTaskType.REGRESSION
+
+     y = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.5])
+     assert define_task(y, False) == ModelTaskType.REGRESSION
+     assert define_task(y, True) == ModelTaskType.REGRESSION
+
+     y = pd.Series([0, 1, 2, 3, 4, 5, 6, 8])
+     assert define_task(y, False) == ModelTaskType.REGRESSION
+     assert define_task(y, True) == ModelTaskType.REGRESSION
+
+     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0])
+     assert define_task(y, False) == ModelTaskType.REGRESSION
+     assert define_task(y, True) == ModelTaskType.REGRESSION

upgini-1.1.253a5/src/upgini/utils/target_utils.py (removed)

@@ -1,183 +0,0 @@
- import logging
- from typing import Optional, Union
-
- import numpy as np
- import pandas as pd
- from pandas.api.types import is_numeric_dtype
-
- from upgini.errors import ValidationError
- from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
- from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
- from upgini.sampler.random_under_sampler import RandomUnderSampler
- from upgini.utils.warning_counter import WarningCounter
-
-
- def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
-     if isinstance(y, pd.Series):
-         return y.astype(str).astype("category").cat.codes
-     elif isinstance(y, np.ndarray):
-         return pd.Series(y).astype(str).astype("category").cat.codes.values
-
-
- def define_task(
-     y: pd.Series, has_date: bool = False, logger: Optional[logging.Logger] = None, silent: bool = False
- ) -> ModelTaskType:
-     if logger is None:
-         logger = logging.getLogger()
-     target = y.dropna()
-     if is_numeric_dtype(target):
-         target = target.loc[np.isfinite(target)]
-     else:
-         target = target.loc[target != ""]
-     if len(target) == 0:
-         raise ValidationError(bundle.get("empty_target"))
-     target_items = target.nunique()
-     if target_items == 1:
-         raise ValidationError(bundle.get("dataset_constant_target"))
-     if target_items == 2:
-         task = ModelTaskType.BINARY
-     else:
-         try:
-             target = pd.to_numeric(target)
-             is_numeric = True
-         except Exception:
-             is_numeric = False
-
-         # If any value is non numeric - multiclass
-         if not is_numeric:
-             task = ModelTaskType.MULTICLASS
-         else:
-             if target.nunique() <= 50 and is_int_encoding(target.unique()):
-                 task = ModelTaskType.MULTICLASS
-             elif has_date:
-                 task = ModelTaskType.REGRESSION
-             else:
-                 non_zero_target = target[target != 0]
-                 target_items = non_zero_target.nunique()
-                 target_ratio = target_items / len(non_zero_target)
-                 if (
-                     (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
-                     or target_items > 50
-                     or target_ratio > 0.2
-                 ):
-                     task = ModelTaskType.REGRESSION
-                 else:
-                     task = ModelTaskType.MULTICLASS
-
-     logger.info(f"Detected task type: {task}")
-     if not silent:
-         print(bundle.get("target_type_detected").format(task))
-     return task
-
-
- def is_int_encoding(unique_values):
-     return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
-         range(1, len(unique_values) + 1)
-     )
-
-
- def balance_undersample(
-     df: pd.DataFrame,
-     target_column: str,
-     task_type: ModelTaskType,
-     random_state: int,
-     imbalance_threshold: int = 0.2,
-     min_sample_threshold: int = 5000,
-     binary_bootstrap_loops: int = 5,
-     multiclass_bootstrap_loops: int = 2,
-     logger: Optional[logging.Logger] = None,
-     bundle: Optional[ResourceBundle] = None,
-     warning_counter: Optional[WarningCounter] = None,
- ) -> pd.DataFrame:
-     if logger is None:
-         logger = logging.getLogger("muted_logger")
-         logger.setLevel("FATAL")
-     bundle = bundle or get_custom_bundle()
-     if SYSTEM_RECORD_ID not in df.columns:
-         raise Exception("System record id must be presented for undersampling")
-
-     count = len(df)
-     target = df[target_column].copy()
-     target_classes_count = target.nunique()
-
-     vc = target.value_counts()
-     max_class_value = vc.index[0]
-     min_class_value = vc.index[len(vc) - 1]
-     max_class_count = vc[max_class_value]
-     min_class_count = vc[min_class_value]
-
-     min_class_percent = imbalance_threshold / target_classes_count
-     min_class_threshold = min_class_percent * count
-
-     resampled_data = df
-     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
-     if task_type == ModelTaskType.MULTICLASS:
-         # Sort classes by rows count and find 25% quantile class
-         classes = vc.index
-         quantile25_idx = int(0.75 * len(classes)) - 1
-         quantile25_class = classes[quantile25_idx]
-         quantile25_class_cnt = vc[quantile25_class]
-
-         if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
-             msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
-             logger.warning(msg)
-             print(msg)
-             if warning_counter:
-                 warning_counter.increment()
-
-             # 25% and lower classes will stay as is. Higher classes will be downsampled
-             sample_strategy = dict()
-             for class_idx in range(quantile25_idx):
-                 # compare class count with count_of_quantile25_class * 2
-                 class_value = classes[class_idx]
-                 class_count = vc[class_value]
-                 sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
-             sampler = RandomUnderSampler(
-                 sampling_strategy=sample_strategy, random_state=random_state
-             )
-             X = df[SYSTEM_RECORD_ID]
-             X = X.to_frame(SYSTEM_RECORD_ID)
-             new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-
-             resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-     elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
-         msg = bundle.get("dataset_rarest_class_less_threshold").format(
-             min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-         )
-         logger.warning(msg)
-         print(msg)
-         if warning_counter:
-             warning_counter.increment()
-
-         # fill up to min_sample_threshold by majority class
-         minority_class = df[df[target_column] == min_class_value]
-         majority_class = df[df[target_column] != min_class_value]
-         sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
-         sampled_majority_class = majority_class.sample(
-             n=sample_size, random_state=random_state
-         )
-         resampled_data = df[
-             (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
-             | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-         ]
-
-     elif max_class_count > min_class_count * binary_bootstrap_loops:
-         msg = bundle.get("dataset_rarest_class_less_threshold").format(
-             min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-         )
-         logger.warning(msg)
-         print(msg)
-         if warning_counter:
-             warning_counter.increment()
-
-         sampler = RandomUnderSampler(
-             sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
-         )
-         X = df[SYSTEM_RECORD_ID]
-         X = X.to_frame(SYSTEM_RECORD_ID)
-         new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-
-         resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-
-     logger.info(f"Shape after rebalance resampling: {resampled_data}")
-     return resampled_data

upgini-1.1.253a5/tests/test_target_utils.py (removed)

@@ -1,134 +0,0 @@
- import numpy as np
- import pandas as pd
- import pytest
- from pandas.testing import assert_frame_equal
-
- from upgini.errors import ValidationError
- from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType
- from upgini.resource_bundle import bundle
- from upgini.utils.target_utils import balance_undersample, define_task
-
-
- def test_invalid_target():
-     y = pd.Series(["", "", ""])
-     with pytest.raises(ValidationError, match=bundle.get("empty_target")):
-         define_task(y)
-
-     y = pd.Series([np.nan, np.inf, -np.inf])
-     with pytest.raises(ValidationError, match=bundle.get("empty_target")):
-         define_task(y)
-
-     y = pd.Series([1, 1, 1, 1, 1])
-     with pytest.raises(ValidationError, match=bundle.get("dataset_constant_target")):
-         define_task(y)
-
-
- def test_define_binary_task_type():
-     y = pd.Series([0, 1, 0, 1, 0, 1])
-     assert define_task(y, False) == ModelTaskType.BINARY
-     assert define_task(y, True) == ModelTaskType.BINARY
-
-     y = pd.Series(["a", "b", "a", "b", "a"])
-     assert define_task(y, False) == ModelTaskType.BINARY
-     assert define_task(y, True) == ModelTaskType.BINARY
-
-
- def test_define_multiclass_task_type():
-     y = pd.Series(range(1, 51))
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-     assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-     y = pd.Series([float(x) for x in range(1, 51)])
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-     assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-     y = pd.Series(range(0, 50))
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-     assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-     y = pd.Series(["a", "b", "c", "b", "a"])
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-     assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-     y = pd.Series(["0", "1", "2", "3", "a"])
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-     assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
-     assert define_task(y, False) == ModelTaskType.MULTICLASS
-
-
- def test_define_regression_task_type():
-     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
-     assert define_task(y, True) == ModelTaskType.REGRESSION
-
-     y = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.5])
-     assert define_task(y, False) == ModelTaskType.REGRESSION
-     assert define_task(y, True) == ModelTaskType.REGRESSION
-
-     y = pd.Series([0, 1, 2, 3, 4, 5, 6, 8])
-     assert define_task(y, False) == ModelTaskType.REGRESSION
-     assert define_task(y, True) == ModelTaskType.REGRESSION
-
-     y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0])
-     assert define_task(y, False) == ModelTaskType.REGRESSION
-     assert define_task(y, True) == ModelTaskType.REGRESSION
-
-
- def test_balance_undersampling_binary():
-     df = pd.DataFrame({SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], TARGET: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
-     balanced_df = balance_undersample(
-         df, TARGET, ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=2
-     )
-     # Get all minority class and 5x of majority class if minority class count (1)
-     # more or equal to min_sample_threshold/2 (1)
-     expected_df = pd.DataFrame({
-         SYSTEM_RECORD_ID: [1, 2, 3, 7, 9, 10],
-         TARGET: [0, 1, 0, 0, 0, 0]
-     })
-     assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)
-
-     balanced_df = balance_undersample(
-         df, TARGET, ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=8
-     )
-     # Get all minority class and fill up to min_sample_threshold (8) by majority class
-     expected_df = pd.DataFrame({
-         SYSTEM_RECORD_ID: [1, 2, 3, 4, 6, 7, 9, 10],
-         TARGET: [0, 1, 0, 0, 0, 0, 0, 0]
-     })
-     assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)
-
-     df = pd.DataFrame({"system_record_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], TARGET: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0]})
-     balanced_df = balance_undersample(
-         df, "target", ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=4
-     )
-     # Get full dataset if majority class count (8) less than x5 of minority class count (2)
-     assert_frame_equal(balanced_df, df)
-
-
- def test_balance_undersaampling_multiclass():
-     df = pd.DataFrame({
-         SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6],
-         TARGET: ["a", "b", "c", "c", "b", "c"]
-         # a - 1, b - 2, c - 3
-     })
-     balanced_df = balance_undersample(
-         df, TARGET, ModelTaskType.MULTICLASS, 42, imbalance_threshold=0.1, min_sample_threshold=10
-     )
-     # Get full dataset if majority class count (3) less than x2 of 25% class (b) count (2)
-     assert_frame_equal(balanced_df, df)
-
-     df = pd.DataFrame({
-         SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
-         TARGET: ["a", "b", "c", "c", "c", "b", "c", "d", "d", "d", "c"]
-         # a - 1, b - 2, c - 5, d - 3
-     })
-     balanced_df = balance_undersample(
-         df, TARGET, ModelTaskType.MULTICLASS, 42, imbalance_threshold=0.1, min_sample_threshold=10
-     )
-     expected_df = pd.DataFrame({
-         SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 8, 9, 10, 11],
-         TARGET: ["a", "b", "c", "c", "c", "b", "d", "d", "d", "c"]
-     })
-     # Get all of 25% quantile class (b) and minor classes (a) and x2 (or all if less) of major classes
-     assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)