upgini 1.2.60__tar.gz → 1.2.60a3792.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (70)
  1. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/PKG-INFO +1 -2
  2. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/pyproject.toml +0 -1
  3. upgini-1.2.60a3792.dev2/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/date.py +1 -1
  5. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/dataset.py +17 -7
  6. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/features_enricher.py +45 -107
  7. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/metrics.py +7 -4
  8. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/strings.properties +0 -1
  9. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/datetime_utils.py +0 -2
  10. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/target_utils.py +57 -4
  11. upgini-1.2.60a3792.dev2/src/upgini/utils/ts_utils.py +47 -0
  12. upgini-1.2.60/src/upgini/__about__.py +0 -1
  13. upgini-1.2.60/src/upgini/utils/mstats.py +0 -177
  14. upgini-1.2.60/src/upgini/utils/sort.py +0 -172
  15. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/.gitignore +0 -0
  16. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/LICENSE +0 -0
  17. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/README.md +0 -0
  18. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/__init__.py +0 -0
  19. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads.py +0 -0
  20. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads_management/__init__.py +0 -0
  21. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  22. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/__init__.py +0 -0
  23. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/all_operands.py +0 -0
  24. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/binary.py +0 -0
  25. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/feature.py +0 -0
  26. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/groupby.py +0 -0
  27. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/operand.py +0 -0
  28. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/unary.py +0 -0
  29. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/vector.py +0 -0
  30. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
  32. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/errors.py +0 -0
  33. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/http.py +0 -0
  34. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/lazy_import.py +0 -0
  35. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/metadata.py +0 -0
  38. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/normalizer/__init__.py +0 -0
  39. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/normalizer/normalize_utils.py +0 -0
  40. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/search_task.py +0 -0
  48. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/spinner.py +0 -0
  49. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  50. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/__init__.py +0 -0
  51. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  52. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  53. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/country_utils.py +0 -0
  54. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  55. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/cv_utils.py +0 -0
  56. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  57. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/display_utils.py +0 -0
  58. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/email_utils.py +0 -0
  59. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  60. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/feature_info.py +0 -0
  61. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/features_validator.py +0 -0
  62. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/format.py +0 -0
  63. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/ip_utils.py +0 -0
  64. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/phone_utils.py +0 -0
  65. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  66. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/progress_bar.py +0 -0
  67. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  68. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/track_info.py +0 -0
  69. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/warning_counter.py +0 -0
  70. {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.60
3
+ Version: 1.2.60a3792.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -30,7 +30,6 @@ Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: numpy<=1.26.4,>=1.19.0
32
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
33
- Requires-Dist: psutil>=6.0.0
34
33
  Requires-Dist: pydantic<3.0.0,>1.0.0
35
34
  Requires-Dist: pyjwt>=2.8.0
36
35
  Requires-Dist: python-bidi==0.4.2
@@ -50,7 +50,6 @@ dependencies = [
50
50
  "xhtml2pdf>=0.2.11,<0.3.0",
51
51
  "jarowinkler>=2.0.0",
52
52
  "levenshtein>=0.25.1",
53
- "psutil>=6.0.0",
54
53
  ]
55
54
 
56
55
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.60a3792.dev2"
@@ -1,6 +1,6 @@
1
1
  import abc
2
2
  import json
3
- from typing import Dict, List, Optional, Union
3
+ from typing import Any, Dict, List, Optional, Union
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
40
40
  from upgini.utils.target_utils import (
41
41
  balance_undersample,
42
42
  balance_undersample_forced,
43
- balance_undersample_time_series,
43
+ balance_undersample_time_series_trunc,
44
44
  )
45
45
 
46
46
  try:
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
58
58
  FIT_SAMPLE_THRESHOLD = 200_000
59
59
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
60
60
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
61
+ FIT_SAMPLE_THRESHOLD_TS = 54_000
62
+ FIT_SAMPLE_ROWS_TS = 54_000
61
63
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
62
64
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
63
65
  IMBALANCE_THESHOLD = 0.6
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
301
303
  )
302
304
 
303
305
  # Resample over fit threshold
304
- if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
306
+ if self.cv_type is not None and self.cv_type.is_time_series():
307
+ sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
308
+ sample_rows = self.FIT_SAMPLE_ROWS_TS
309
+ elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
305
310
  sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
306
311
  sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
307
312
  else:
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
314
319
  f"and will be downsampled to {sample_rows}"
315
320
  )
316
321
  if self.cv_type is not None and self.cv_type.is_time_series():
317
- resampled_data = balance_undersample_time_series(
322
+ resampled_data = balance_undersample_time_series_trunc(
318
323
  df=self.data,
319
324
  id_columns=self.id_columns,
320
325
  date_column=next(
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
584
589
  return search_customization
585
590
 
586
591
  def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
587
- if (
588
- runtime_parameters is not None
589
- and runtime_parameters.properties is not None
590
- ):
592
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
591
593
  if "generate_features" in runtime_parameters.properties:
592
594
  generate_features = runtime_parameters.properties["generate_features"].split(",")
593
595
  renamed_generate_features = []
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
607
609
 
608
610
  return runtime_parameters
609
611
 
612
+ def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
613
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
614
+ if self.cv_type is not None and self.cv_type.is_time_series():
615
+ runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
616
+ runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
617
+ return runtime_parameters
618
+
610
619
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
611
620
  if (
612
621
  runtime_parameters is not None
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
638
647
  file_metrics = FileMetrics()
639
648
 
640
649
  runtime_parameters = self._rename_generate_features(runtime_parameters)
650
+ runtime_parameters = self._set_sample_size(runtime_parameters)
641
651
 
642
652
  file_metadata = self.__construct_metadata(exclude_features_sources)
643
653
  search_customization = self.__construct_search_customization(
@@ -112,7 +112,6 @@ try:
112
112
  except Exception:
113
113
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
114
114
 
115
- from upgini.utils.sort import sort_columns
116
115
  from upgini.utils.target_utils import (
117
116
  balance_undersample_forced,
118
117
  calculate_psi,
@@ -1262,7 +1261,7 @@ class FeaturesEnricher(TransformerMixin):
1262
1261
  for feature, shap in new_shaps.items()
1263
1262
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1264
1263
  }
1265
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1264
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1266
1265
 
1267
1266
  if self.features_info_display_handle is not None:
1268
1267
  try:
@@ -1569,23 +1568,9 @@ class FeaturesEnricher(TransformerMixin):
1569
1568
 
1570
1569
  fitting_eval_set_dict = {}
1571
1570
  fitting_x_columns = fitting_X.columns.to_list()
1572
- # Idempotently sort columns
1573
- fitting_x_columns = sort_columns(
1574
- fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
1575
- )
1576
- fitting_X = fitting_X[fitting_x_columns]
1577
- self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
1571
+ self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
1578
1572
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
1579
- fitting_enriched_x_columns = sort_columns(
1580
- fitting_enriched_X,
1581
- enriched_y_sorted,
1582
- search_keys,
1583
- self.model_task_type,
1584
- sort_all_columns=True,
1585
- logger=self.logger,
1586
- )
1587
- fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
1588
- self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
1573
+ self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
1589
1574
  for idx, eval_tuple in eval_set_sampled_dict.items():
1590
1575
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1591
1576
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1749,15 +1734,11 @@ class FeaturesEnricher(TransformerMixin):
1749
1734
  if eval_set is not None
1750
1735
  else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
1751
1736
  )
1752
-
1753
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
1754
- # Sample after sorting by system_record_id for idempotency
1755
- df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
1756
-
1757
1737
  if num_samples > sample_threshold:
1758
1738
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1759
1739
  df = df.sample(n=sample_rows, random_state=self.random_state)
1760
1740
 
1741
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1761
1742
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1762
1743
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1763
1744
 
@@ -1901,7 +1882,6 @@ class FeaturesEnricher(TransformerMixin):
1901
1882
  and self.columns_for_online_api is not None
1902
1883
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1903
1884
  )
1904
- # TODO: check that system_record_id was added before this step
1905
1885
  if force_downsampling:
1906
1886
  self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1907
1887
  df = balance_undersample_forced(
@@ -1935,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
1935
1915
  progress_bar=progress_bar,
1936
1916
  progress_callback=progress_callback,
1937
1917
  add_fit_system_record_id=True,
1938
- target_name=tmp_target_name,
1939
1918
  )
1940
1919
  if enriched_df is None:
1941
1920
  return None
@@ -1974,7 +1953,6 @@ class FeaturesEnricher(TransformerMixin):
1974
1953
  and self.columns_for_online_api is not None
1975
1954
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1976
1955
  )
1977
-
1978
1956
  if force_downsampling:
1979
1957
  self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1980
1958
  df = balance_undersample_forced(
@@ -2006,7 +1984,6 @@ class FeaturesEnricher(TransformerMixin):
2006
1984
  progress_bar=progress_bar,
2007
1985
  progress_callback=progress_callback,
2008
1986
  add_fit_system_record_id=True,
2009
- target_name=tmp_target_name,
2010
1987
  )
2011
1988
  if enriched_Xy is None:
2012
1989
  return None
@@ -2168,7 +2145,6 @@ if response.status_code == 200:
2168
2145
  progress_bar: Optional[ProgressBar] = None,
2169
2146
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2170
2147
  add_fit_system_record_id: bool = False,
2171
- target_name: Optional[str] = None,
2172
2148
  ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2173
2149
  if self._search_task is None:
2174
2150
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2353,16 +2329,8 @@ if response.status_code == 200:
2353
2329
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2354
2330
  ]
2355
2331
 
2356
- if add_fit_system_record_id and target_name is not None:
2357
- reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
2358
- df = self.__add_fit_system_record_id(
2359
- df,
2360
- search_keys,
2361
- SYSTEM_RECORD_ID,
2362
- reversed_columns_renaming.get(target_name, target_name),
2363
- columns_renaming,
2364
- silent=True,
2365
- )
2332
+ if add_fit_system_record_id:
2333
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2366
2334
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2367
2335
  features_not_to_pass.append(SORT_ID)
2368
2336
 
@@ -2807,9 +2775,7 @@ if response.status_code == 200:
2807
2775
  self.__log_warning(full_duplicates_warning)
2808
2776
 
2809
2777
  # Explode multiple search keys
2810
- df = self.__add_fit_system_record_id(
2811
- df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
2812
- )
2778
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2813
2779
 
2814
2780
  # TODO check that this is correct for enrichment
2815
2781
  self.df_with_original_index = df.copy()
@@ -2891,9 +2857,7 @@ if response.status_code == 200:
2891
2857
  if eval_set is not None and len(eval_set) > 0:
2892
2858
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2893
2859
 
2894
- df = self.__add_fit_system_record_id(
2895
- df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
2896
- )
2860
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2897
2861
 
2898
2862
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2899
2863
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -3580,82 +3544,56 @@ if response.status_code == 200:
3580
3544
  def __add_fit_system_record_id(
3581
3545
  self,
3582
3546
  df: pd.DataFrame,
3547
+ # meaning_types: Dict[str, FileColumnMeaningType],
3583
3548
  search_keys: Dict[str, SearchKey],
3584
3549
  id_name: str,
3585
- target_name: str,
3586
- columns_renaming: Dict[str, str],
3587
- silent: bool = False,
3588
3550
  ) -> pd.DataFrame:
3551
+ # save original order or rows
3589
3552
  original_index_name = df.index.name
3590
3553
  index_name = df.index.name or DEFAULT_INDEX
3591
3554
  original_order_name = "original_order"
3592
- # Save original index
3593
3555
  df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
3594
- # Save original order
3595
3556
  df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
3596
3557
 
3597
- # order by date and idempotent order by other keys and features
3558
+ # order by date and idempotent order by other keys
3559
+ if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3560
+ sort_exclude_columns = [
3561
+ original_order_name,
3562
+ ORIGINAL_INDEX,
3563
+ EVAL_SET_INDEX,
3564
+ TARGET,
3565
+ "__target",
3566
+ ENTITY_SYSTEM_RECORD_ID,
3567
+ ]
3568
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3569
+ date_column = DateTimeSearchKeyConverter.DATETIME_COL
3570
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3571
+ else:
3572
+ date_column = self._get_date_column(search_keys)
3573
+ sort_columns = [date_column] if date_column is not None else []
3598
3574
 
3599
- sort_exclude_columns = [
3600
- original_order_name,
3601
- ORIGINAL_INDEX,
3602
- EVAL_SET_INDEX,
3603
- TARGET,
3604
- "__target",
3605
- ENTITY_SYSTEM_RECORD_ID,
3606
- ]
3607
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3608
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3609
- sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
3610
- else:
3611
- date_column = FeaturesEnricher._get_date_column(search_keys)
3612
- sort_exclude_columns.append(date_column)
3613
- columns_to_sort = [date_column] if date_column is not None else []
3614
-
3615
- do_sorting = True
3616
- if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
3617
- # Check duplicates by date and id_columns
3618
- reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
3619
- renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
3620
- duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
3621
- if date_column is not None:
3622
- duplicate_check_columns.append(date_column)
3575
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3576
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3623
3577
 
3624
- duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3625
- if duplicates.any():
3626
- if not silent:
3627
- self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3628
- else:
3629
- self.logger.warning(
3630
- f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
3631
- " Will not sort dataset"
3632
- )
3633
- do_sorting = False
3634
- else:
3635
- columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3636
- columns_to_hash = sort_columns(
3637
- df[columns_to_hash],
3638
- target_name,
3639
- search_keys,
3640
- self.model_task_type,
3641
- sort_exclude_columns,
3642
- logger=self.logger,
3643
- )
3644
- else:
3645
- columns_to_hash = sort_columns(
3646
- df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
3578
+ other_columns = sorted(
3579
+ [
3580
+ c
3581
+ for c in df.columns
3582
+ if c not in sort_columns
3583
+ and c not in sorted_other_keys
3584
+ and c not in sort_exclude_columns
3585
+ and df[c].nunique() > 1
3586
+ ]
3647
3587
  )
3648
- if do_sorting:
3588
+
3589
+ all_other_columns = sorted_other_keys + other_columns
3590
+
3649
3591
  search_keys_hash = "search_keys_hash"
3650
- if len(columns_to_hash) > 0:
3651
- factorized_df = df.copy()
3652
- for col in columns_to_hash:
3653
- if col not in search_keys and not is_numeric_dtype(factorized_df[col]):
3654
- factorized_df[col] = factorized_df[col].factorize(sort=True)[0]
3655
- df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
3656
- columns_to_sort.append(search_keys_hash)
3657
-
3658
- df = df.sort_values(by=columns_to_sort)
3592
+ if len(all_other_columns) > 0:
3593
+ sort_columns.append(search_keys_hash)
3594
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3595
+
3596
+ df = df.sort_values(by=sort_columns)
3659
3597
 
3660
3598
  if search_keys_hash in df.columns:
3661
3599
  df.drop(columns=search_keys_hash, inplace=True)
@@ -30,8 +30,8 @@ except ImportError:
30
30
  from sklearn.metrics._regression import (
31
31
  _check_reg_targets,
32
32
  check_consistent_length,
33
+ mean_squared_error,
33
34
  )
34
- from sklearn.metrics import mean_squared_error
35
35
  from sklearn.model_selection import BaseCrossValidator
36
36
 
37
37
  from upgini.errors import ValidationError
@@ -289,6 +289,9 @@ class EstimatorWrapper:
289
289
  else:
290
290
  x, y = self._remove_empty_target_rows(x, y)
291
291
 
292
+ # Make order of columns idempotent
293
+ x = x[sorted(x.columns)]
294
+
292
295
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
293
296
  return x, y, groups
294
297
 
@@ -566,7 +569,7 @@ class CatBoostWrapper(EstimatorWrapper):
566
569
  if all([isinstance(c, int) for c in estimator_cat_features]):
567
570
  cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
568
571
  cat_features_idx.update(estimator_cat_features)
569
- self.cat_features = [x.columns[idx] for idx in cat_features_idx]
572
+ self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
570
573
  elif all([isinstance(c, str) for c in estimator_cat_features]):
571
574
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
572
575
  else:
@@ -937,13 +940,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
937
940
  if (y_true < 0).any():
938
941
  raise ValidationError(bundle.get("metrics_msle_negative_target"))
939
942
 
940
- mse = mean_squared_error(
943
+ return mean_squared_error(
941
944
  log1p(y_true),
942
945
  log1p(y_pred.clip(0)),
943
946
  sample_weight=sample_weight,
944
947
  multioutput=multioutput,
948
+ squared=squared,
945
949
  )
946
- return mse if squared else np.sqrt(mse)
947
950
 
948
951
 
949
952
  def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
35
35
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
36
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
37
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
- date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
39
38
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
40
39
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
40
  # Errors
@@ -166,8 +166,6 @@ class DateTimeSearchKeyConverter:
166
166
 
167
167
  # Drop intermediate columns if not needed
168
168
  df.drop(columns=["second", "minute", "hour"], inplace=True)
169
- else:
170
- keep_time = False
171
169
 
172
170
  for generated_feature in self.generated_features[:]:
173
171
  if df[generated_feature].dropna().nunique() <= 1:
@@ -1,3 +1,4 @@
1
+ import itertools
1
2
  import logging
2
3
  from typing import Callable, List, Optional, Union
3
4
 
@@ -9,6 +10,7 @@ from upgini.errors import ValidationError
9
10
  from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
10
11
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
12
  from upgini.sampler.random_under_sampler import RandomUnderSampler
13
+ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
12
14
 
13
15
  TS_MIN_DIFFERENT_IDS_RATIO = 0.2
14
16
 
@@ -206,7 +208,7 @@ def balance_undersample_forced(
206
208
  id_columns: List[str],
207
209
  date_column: str,
208
210
  task_type: ModelTaskType,
209
- cv_type: Optional[CVType],
211
+ cv_type: CVType | None,
210
212
  random_state: int,
211
213
  sample_size: int = 7000,
212
214
  logger: Optional[logging.Logger] = None,
@@ -240,7 +242,7 @@ def balance_undersample_forced(
240
242
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
241
243
  if cv_type is not None and cv_type.is_time_series():
242
244
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
243
- resampled_data = balance_undersample_time_series(
245
+ resampled_data = balance_undersample_time_series_trunc(
244
246
  df,
245
247
  id_columns=id_columns,
246
248
  date_column=date_column,
@@ -279,6 +281,58 @@ def balance_undersample_forced(
279
281
  return resampled_data
280
282
 
281
283
 
284
+ DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
285
+ DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
286
+ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
287
+
288
+
289
+ def balance_undersample_time_series_trunc(
290
+ df: pd.DataFrame,
291
+ id_columns: List[str],
292
+ date_column: str,
293
+ sample_size: int,
294
+ random_state: int = 42,
295
+ logger: Optional[logging.Logger] = None,
296
+ highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
297
+ lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
298
+ time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
299
+ **kwargs,
300
+ ):
301
+ # Convert date column to datetime
302
+ dates_df = df[id_columns + [date_column]].copy()
303
+ dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
304
+
305
+ time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
306
+ if logger is not None:
307
+ logger.info(f"Time unit: {time_unit}")
308
+
309
+ if time_unit is None:
310
+ if logger is not None:
311
+ logger.info("Cannot detect time unit, returning original dataset")
312
+ return df
313
+
314
+ if time_unit < time_unit_threshold:
315
+ for trunc_length in highfreq_trunc_lengths:
316
+ sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
317
+ if len(sampled_df) <= sample_size:
318
+ break
319
+ if len(sampled_df) > sample_size:
320
+ sampled_df = balance_undersample_time_series(
321
+ sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
322
+ )
323
+ else:
324
+ for trunc_length in lowfreq_trunc_lengths:
325
+ sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
326
+ if len(sampled_df) <= sample_size:
327
+ break
328
+ if len(sampled_df) > sample_size:
329
+ sampled_df = balance_undersample_time_series(
330
+ sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
331
+ )
332
+
333
+ return df.loc[sampled_df.index]
334
+
335
+
282
336
  def balance_undersample_time_series(
283
337
  df: pd.DataFrame,
284
338
  id_columns: List[str],
@@ -318,8 +372,7 @@ def balance_undersample_time_series(
318
372
  if len(id_counts) < min_different_ids:
319
373
  if logger is not None:
320
374
  logger.info(
321
- f"Different ids count {len(id_counts)} for sample size {sample_size}"
322
- f" is less than min different ids {min_different_ids}, sampling time window"
375
+ f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
323
376
  )
324
377
  date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
325
378
  ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
@@ -0,0 +1,47 @@
1
+ import logging
2
+ from typing import List, Optional
3
+ import pandas as pd
4
+
5
+
6
+ def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
7
+
8
+ def closest_unit(diff):
9
+ return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
10
+
11
+ # Calculate differences for each ID group
12
+ all_diffs = []
13
+ groups = df.groupby(id_columns) if id_columns else [(None, df)]
14
+ for _, group in groups:
15
+ # Get sorted dates for this group
16
+ group_dates = group[date_column].sort_values().unique()
17
+ if len(group_dates) > 1:
18
+ # Calculate time differences between consecutive dates
19
+ diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
20
+ # Convert to nanoseconds
21
+ diff_ns = diff_series.dt.total_seconds()
22
+ all_diffs.extend(diff_ns)
23
+
24
+ # Convert to series for easier processing
25
+ all_diffs = pd.Series(all_diffs)
26
+
27
+ # Get most common time unit across all groups
28
+ most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
29
+
30
+ return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
31
+
32
+
33
+ def trunc_datetime(
34
+ df: pd.DataFrame,
35
+ id_columns: List[str],
36
+ date_column: str,
37
+ length: pd.DateOffset,
38
+ logger: Optional[logging.Logger] = None,
39
+ ) -> pd.DataFrame:
40
+ if logger is not None:
41
+ logger.info(f"Truncating time series dataset to {length}")
42
+
43
+ if id_columns:
44
+ min_datetime = df.groupby(id_columns)[date_column].transform(lambda group: group.max() - length)
45
+ else:
46
+ min_datetime = df[date_column].max() - length
47
+ return df[df[date_column] > min_datetime]
@@ -1 +0,0 @@
1
- __version__ = "1.2.60"
@@ -1,177 +0,0 @@
1
- import warnings
2
- from collections import namedtuple
3
-
4
- import numpy as np
5
- import numpy.ma as ma
6
- import scipy
7
- from joblib import Parallel, delayed
8
- from numpy import ndarray
9
- from psutil import cpu_count
10
-
11
# Module-import side effects: silence numpy divide-by-zero errors and all
# RuntimeWarnings process-wide. NOTE(review): these mutate global state for
# every importer of this module, not just the functions below - confirm
# that is intended.
np.seterr(divide="ignore")


warnings.simplefilter(action="ignore", category=RuntimeWarning)
15
-
16
-
17
- def _find_repeats(arr):
18
- # This function assumes it may clobber its input.
19
- if len(arr) == 0:
20
- return np.array(0, np.float64), np.array(0, np.intp)
21
-
22
- # XXX This cast was previously needed for the Fortran implementation,
23
- # should we ditch it?
24
- arr = np.asarray(arr, np.float64).ravel()
25
- arr.sort()
26
-
27
- # Taken from NumPy 1.9's np.unique.
28
- change = np.concatenate(([True], arr[1:] != arr[:-1]))
29
- unique = arr[change]
30
- change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
31
- freq = np.diff(change_idx)
32
- atleast2 = freq > 1
33
- return unique[atleast2], freq[atleast2]
34
-
35
-
36
def find_repeats(arr):
    """Masked-array front-end for `_find_repeats`.

    Compresses the mask away and guarantees `_find_repeats` receives memory
    it may freely sort in place.
    """
    # ma.compressed promises a "new array" but can actually hand back a view
    # of the input, so defensively copy when memory is shared.
    compressed = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        shares_memory = np.may_share_memory(compressed, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises, while in
        # numpy 1.8.2 and above it just (correctly) returns False.
        shares_memory = False
    if shares_memory:
        compressed = compressed.copy()
    return _find_repeats(compressed)
49
-
50
-
51
def rankdata(data, axis=None, use_missing=False):
    """Rank values of a (possibly masked) array, averaging ranks over ties.

    Parameters
    ----------
    data : array-like, possibly masked.
    axis : axis to rank along; None ranks the raveled array but preserves
        the original shape in the result.
    use_missing : when True, masked entries receive the midrank
        ``(n + 1) / 2``; when False they receive rank 0.
    """

    def _rank1d(data, use_missing=False):
        n = data.count()  # number of unmasked values
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        # Unmasked values get ranks 1..n in sorted order
        rk[idx[:n]] = np.arange(1, n + 1)

        if use_missing:
            # All masked entries share the average rank
            rk[idx[n:]] = (n + 1) / 2.0
        else:
            rk[idx[n:]] = 0

        # Average the ranks of tied values (copy: find_repeats may sort)
        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            # Rank the raveled data, then restore the original shape
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
77
-
78
-
79
- def _chk_asarray(a, axis):
80
- # Always returns a masked array, raveled for axis=None
81
- a = ma.asanyarray(a)
82
- if axis is None:
83
- a = ma.ravel(a)
84
- outaxis = 0
85
- else:
86
- outaxis = axis
87
- return a, outaxis
88
-
89
-
90
# Result pair mirroring scipy.stats.mstats.spearmanr's return type.
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
91
-
92
-
93
- # Taken from scipy.mstats with following tweaks:
94
- # 1. parallel pairwise computation
95
- # 2. custom masking
96
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Spearman rank-order correlation with pairwise-parallel computation.

    Port of scipy.stats.mstats.spearmanr with two tweaks: variable pairs are
    computed in parallel (joblib), and the masking applied under
    ``nan_policy="omit"`` is pluggable via `mask_fn`.

    Returns a SpearmanrResult - scalar correlation/pvalue when exactly two
    columns are compared, otherwise full (n_vars, n_vars) matrices.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follow (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        # NOTE(review): relies on SciPy's private _ttest_finish helper; it
        # may move or change signature between SciPy releases - verify on
        # SciPy upgrades.
        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        max_cpu_cores = cpu_count(logical=False)
        with np.errstate(divide="ignore"):
            results = Parallel(n_jobs=max_cpu_cores)(
                delayed(_spearmanr_2cols)(x[:, [var1, var2]])
                for var1 in range(n_vars - 1)
                for var2 in range(var1 + 1, n_vars)
            )

        # Reassemble the pairwise results into symmetric matrices; the
        # results list is ordered exactly like the generator above.
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = results.pop(0)
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        return SpearmanrResult(rs, prob)
@@ -1,172 +0,0 @@
1
- import hashlib
2
- import logging
3
- from typing import Any, Dict, List, Optional, Union
4
-
5
- import numpy as np
6
- import pandas as pd
7
- from joblib import Parallel, delayed
8
- from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
9
- from psutil import cpu_count
10
- from scipy.stats import skew, spearmanr
11
-
12
- from upgini.metadata import ModelTaskType, SearchKey
13
- from upgini.utils import mstats
14
-
15
-
16
def sort_columns(
    df: pd.DataFrame,
    target_column: Union[str, pd.Series],
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: Optional[List[str]] = None,
    sort_all_columns: bool = False,
    logger: Optional[logging.Logger] = None,
) -> List[str]:
    """Return a deterministic column ordering: search keys first, then the
    remaining columns sorted by (target correlation, content hash) descending.

    Parameters
    ----------
    df : frame whose columns are ordered.
    target_column : target column name in `df`, or the target series itself.
    search_keys : mapping of column name -> SearchKey role.
    model_task_type : task type; drives target preprocessing.
    exclude_columns : columns dropped from the ordering entirely.
    sort_all_columns : when True, also rank constant and string columns.
    logger : optional logger; a silenced one is created when omitted.
    """
    if exclude_columns is None:
        exclude_columns = []
    if logger is None:
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.FATAL)
    df = df.copy()  # avoid side effects

    # Check multiple search keys
    search_key_values = list(search_keys.values())
    has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
    if has_duplicate_search_keys:
        # Fix: was `logging.warning(...)` on the root logger, which bypassed
        # the caller-provided (or deliberately silenced) `logger` above.
        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")

    # Stable key order, restricted to columns actually present
    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]

    # Non-key candidates; constant columns are skipped unless sort_all_columns
    other_columns = sorted(
        [
            c
            for c in df.columns
            if c not in sorted_keys and c not in exclude_columns and (df[c].nunique() > 1 or sort_all_columns)
        ]
    )
    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
    target = prepare_target(target, model_task_type)
    sort_dict = get_sort_columns_dict(
        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
    )
    # Columns that could not be scored (e.g. dropped string features) are omitted
    other_columns = [c for c in other_columns if c in sort_dict]
    columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
    return columns_for_sort
56
-
57
-
58
def get_sort_columns_dict(
    df: pd.DataFrame,
    target: pd.Series,
    sorted_keys: List[str],
    omit_nan: bool,
    n_jobs: Optional[int] = None,
    sort_all_columns: bool = False,
) -> Dict[str, Any]:
    """Build ``{column: (target_correlation, content_hash)}`` sort keys.

    String (non-numeric) columns are factorized to ints and scored only when
    dropping them would leave duplicate rows, or when `sort_all_columns` is
    set; search-key columns are never scored.
    """
    string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
    columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
    if len(string_features) > 0:
        # Keep string features only if they are needed to disambiguate rows
        # (or when explicitly asked to sort everything)
        if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
            # factorize string features
            for c in string_features:
                df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
            columns_for_sort.extend(string_features)

    if len(columns_for_sort) == 0:
        return {}

    df = df[columns_for_sort]
    # Hash before the float32 conversion below, so equal correlations break
    # ties on stable per-column content hashes
    hashes = [hash_series(df[col]) for col in columns_for_sort]
    df = np.asarray(df, dtype=np.float32)
    correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)

    sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
    return sort_dict
85
-
86
-
87
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
    """Best of |Spearman| / |Pearson| target correlation per column,
    truncated to 7 decimal digits."""
    both_measures = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    return np.max(both_measures, axis=0)
91
-
92
-
93
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
):
    """Stack of absolute target correlations per column.

    Row 0 holds |Spearman|, row 1 |Pearson|; NaNs are replaced by 0 and all
    values are truncated (not rounded) to `precision` decimal digits.
    """
    values = np.asarray(df, dtype=np.float32)
    correlations = np.zeros((2, values.shape[1]))
    correlations[0, :] = np.nan_to_num(
        calculate_spearman_corr_with_target(values, target, omit_nan, n_jobs), copy=False
    )
    # Last row of the Pearson matrix is the target vs every feature
    correlations[1, :] = np.nan_to_num(np.abs(np.corrcoef(values.T, target.T, rowvar=True)[-1, :-1]))

    # Truncate towards zero for deterministic tie-breaking
    scale = 10**precision
    return np.trunc(correlations * scale) / scale
106
-
107
-
108
def calculate_spearman_corr_with_target(
    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
) -> np.ndarray:
    """Absolute Spearman correlation of every column of X with y.

    Empty or constant columns are skipped and reported as NaN. With
    ``omit_nan=True`` each column is computed in parallel with invalid rows
    masked (upgini.utils.mstats.spearmanr); otherwise columns containing NaN
    are skipped entirely and the rest are correlated in one scipy call.
    """
    if isinstance(X, pd.DataFrame):
        X = np.asarray(X, dtype=np.float32)

    if X.size == 0:
        return np.ndarray(shape=(0,))

    all_correlations = np.zeros(X.shape[1])
    all_correlations.fill(np.nan)
    # Only columns with at least two distinct values can correlate
    cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]

    if omit_nan:
        # One masked pairwise correlation per eligible column, in parallel
        results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
            delayed(mstats.spearmanr)(
                X[:, i],
                y,
                nan_policy="omit",
                axis=0,
            )
            for i in cols2calc
        )
        target_correlations = np.array([abs(res.correlation) for res in results])
    else:
        # Drop columns containing NaN entirely, then correlate in one shot
        cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
        target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
        if isinstance(target_correlations, float):
            # Single feature: scipy returned a scalar
            target_correlations = np.abs([target_correlations])
        else:
            # Last row/column of the matrix is the target; take its
            # correlations with each feature
            target_correlations = np.abs(target_correlations)[-1, :-1]

    all_correlations[cols2calc] = target_correlations

    return all_correlations
143
-
144
-
145
def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
    """Spearman correlation of X's columns (plus y, when given).

    Returns 1.0 when fewer than two variables are involved; otherwise the
    scipy correlation (scalar for two variables, matrix for more).
    """
    n_variables = X.shape[1] + (0 if y is None else 1)
    if n_variables < 2:
        return 1.0
    return spearmanr(X, y, nan_policy=nan_policy).correlation
154
-
155
-
156
def hash_series(series: pd.Series) -> int:
    """Deterministic integer digest of a series (values and index)."""
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    return int(hashlib.sha256(row_hashes).hexdigest(), 16)
158
-
159
-
160
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Normalize the target for correlation-based column sorting.

    Non-regression tasks - and non-numeric, non-datetime targets - are
    encoded as category codes; skewed non-negative regression targets are
    log1p-transformed.
    """
    target_name = target.name
    is_continuous = is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    if model_task_type != ModelTaskType.REGRESSION or not is_continuous:
        target = target.astype(str).astype("category").cat.codes
    else:
        # Regression with a numeric/datetime target: damp heavy right skew
        skewness = round(abs(skew(target)), 2)
        if (target.min() >= 0) and (skewness >= 0.9):
            target = np.log1p(target)

    return pd.Series(target, name=target_name)
File without changes
File without changes
File without changes