upgini 1.2.59a3818.dev1__tar.gz → 1.2.60__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (69)
  1. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/PKG-INFO +2 -1
  2. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/pyproject.toml +1 -0
  3. upgini-1.2.60/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/date.py +1 -1
  5. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/vector.py +1 -1
  6. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/data_source/data_source_publisher.py +1 -0
  7. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/dataset.py +16 -8
  8. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/features_enricher.py +141 -60
  9. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/metrics.py +4 -7
  10. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/strings.properties +2 -0
  11. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/datetime_utils.py +2 -0
  12. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/email_utils.py +6 -6
  13. upgini-1.2.60/src/upgini/utils/mstats.py +177 -0
  14. upgini-1.2.60/src/upgini/utils/sort.py +172 -0
  15. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/target_utils.py +3 -3
  16. upgini-1.2.59a3818.dev1/src/upgini/__about__.py +0 -1
  17. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/.gitignore +0 -0
  18. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/LICENSE +0 -0
  19. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/README.md +0 -0
  20. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/__init__.py +0 -0
  21. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads.py +0 -0
  22. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads_management/__init__.py +0 -0
  23. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/ads_management/ads_manager.py +0 -0
  24. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/__init__.py +0 -0
  25. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/all_operands.py +0 -0
  26. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/binary.py +0 -0
  27. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/feature.py +0 -0
  28. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/groupby.py +0 -0
  29. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/operand.py +0 -0
  30. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/data_source/__init__.py +0 -0
  32. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/errors.py +0 -0
  33. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/http.py +0 -0
  34. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/lazy_import.py +0 -0
  35. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/metadata.py +0 -0
  38. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/normalizer/__init__.py +0 -0
  39. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/normalizer/normalize_utils.py +0 -0
  40. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/search_task.py +0 -0
  48. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/spinner.py +0 -0
  49. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  50. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/__init__.py +0 -0
  51. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/base_search_key_detector.py +0 -0
  52. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/blocked_time_series.py +0 -0
  53. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/country_utils.py +0 -0
  54. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/custom_loss_utils.py +0 -0
  55. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/cv_utils.py +0 -0
  56. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/deduplicate_utils.py +0 -0
  57. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/display_utils.py +0 -0
  58. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/fallback_progress_bar.py +0 -0
  59. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/feature_info.py +0 -0
  60. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/features_validator.py +0 -0
  61. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/format.py +0 -0
  62. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/ip_utils.py +0 -0
  63. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/phone_utils.py +0 -0
  64. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/postal_code_utils.py +0 -0
  65. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/progress_bar.py +0 -0
  66. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/sklearn_ext.py +0 -0
  67. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.2.59a3818.dev1 → upgini-1.2.60}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.59a3818.dev1
3
+ Version: 1.2.60
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: numpy<=1.26.4,>=1.19.0
32
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
33
+ Requires-Dist: psutil>=6.0.0
33
34
  Requires-Dist: pydantic<3.0.0,>1.0.0
34
35
  Requires-Dist: pyjwt>=2.8.0
35
36
  Requires-Dist: python-bidi==0.4.2
@@ -50,6 +50,7 @@ dependencies = [
50
50
  "xhtml2pdf>=0.2.11,<0.3.0",
51
51
  "jarowinkler>=2.0.0",
52
52
  "levenshtein>=0.25.1",
53
+ "psutil>=6.0.0",
53
54
  ]
54
55
 
55
56
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.60"
@@ -1,6 +1,6 @@
1
1
  import abc
2
2
  import json
3
- from typing import Any, Dict, List, Optional, Union
3
+ from typing import Dict, List, Optional, Union
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -55,7 +55,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
55
55
  ts.set_index(date.name, inplace=True)
56
56
  ts = ts[ts.index.notna()].sort_index()
57
57
  ts = (
58
- ts.groupby([c.name for c in data[1:-1]], group_keys=True)
58
+ ts.groupby([c.name for c in data[1:-1]])
59
59
  .apply(self._shift)[data[-1].name]
60
60
  .to_frame()
61
61
  .reset_index()
@@ -386,6 +386,7 @@ class DataSourcePublisher:
386
386
  search_keys = [k.value.value for k in search_keys] if search_keys else None
387
387
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
388
388
  task_id = self._rest_client.upload_online(request, trace_id)
389
+ print(f"Uploading online task created. task_id={task_id}")
389
390
  with Spinner():
390
391
  status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
391
392
  while status_response["status"] not in self.FINAL_STATUSES:
@@ -587,15 +587,23 @@ class Dataset: # (pd.DataFrame):
587
587
  if (
588
588
  runtime_parameters is not None
589
589
  and runtime_parameters.properties is not None
590
- and "generate_features" in runtime_parameters.properties
591
590
  ):
592
- generate_features = runtime_parameters.properties["generate_features"].split(",")
593
- renamed_generate_features = []
594
- for f in generate_features:
595
- for new_column, orig_column in self.columns_renaming.items():
596
- if f == orig_column:
597
- renamed_generate_features.append(new_column)
598
- runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
591
+ if "generate_features" in runtime_parameters.properties:
592
+ generate_features = runtime_parameters.properties["generate_features"].split(",")
593
+ renamed_generate_features = []
594
+ for f in generate_features:
595
+ for new_column, orig_column in self.columns_renaming.items():
596
+ if f == orig_column:
597
+ renamed_generate_features.append(new_column)
598
+ runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
599
+ if "columns_for_online_api" in runtime_parameters.properties:
600
+ columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
601
+ renamed_columns_for_online_api = []
602
+ for f in columns_for_online_api:
603
+ for new_column, orig_column in self.columns_renaming.items():
604
+ if f == orig_column:
605
+ renamed_columns_for_online_api.append(new_column)
606
+ runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
599
607
 
600
608
  return runtime_parameters
601
609
 
@@ -112,6 +112,7 @@ try:
112
112
  except Exception:
113
113
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
114
114
 
115
+ from upgini.utils.sort import sort_columns
115
116
  from upgini.utils.target_utils import (
116
117
  balance_undersample_forced,
117
118
  calculate_psi,
@@ -222,6 +223,7 @@ class FeaturesEnricher(TransformerMixin):
222
223
  loss: Optional[str] = None,
223
224
  detect_missing_search_keys: bool = True,
224
225
  generate_features: Optional[List[str]] = None,
226
+ columns_for_online_api: Optional[List[str]] = None,
225
227
  round_embeddings: Optional[int] = None,
226
228
  logs_enabled: bool = True,
227
229
  raise_validation_error: bool = True,
@@ -345,6 +347,9 @@ class FeaturesEnricher(TransformerMixin):
345
347
  self.logger.error(msg)
346
348
  raise ValidationError(msg)
347
349
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
350
+ self.columns_for_online_api = columns_for_online_api
351
+ if columns_for_online_api is not None:
352
+ self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
348
353
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
349
354
  if maybe_downsampling_limit is not None:
350
355
  Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -1257,7 +1262,7 @@ class FeaturesEnricher(TransformerMixin):
1257
1262
  for feature, shap in new_shaps.items()
1258
1263
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1259
1264
  }
1260
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1265
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1261
1266
 
1262
1267
  if self.features_info_display_handle is not None:
1263
1268
  try:
@@ -1564,9 +1569,23 @@ class FeaturesEnricher(TransformerMixin):
1564
1569
 
1565
1570
  fitting_eval_set_dict = {}
1566
1571
  fitting_x_columns = fitting_X.columns.to_list()
1567
- self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
1572
+ # Idempotently sort columns
1573
+ fitting_x_columns = sort_columns(
1574
+ fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
1575
+ )
1576
+ fitting_X = fitting_X[fitting_x_columns]
1577
+ self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
1568
1578
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
1569
- self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
1579
+ fitting_enriched_x_columns = sort_columns(
1580
+ fitting_enriched_X,
1581
+ enriched_y_sorted,
1582
+ search_keys,
1583
+ self.model_task_type,
1584
+ sort_all_columns=True,
1585
+ logger=self.logger,
1586
+ )
1587
+ fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
1588
+ self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
1570
1589
  for idx, eval_tuple in eval_set_sampled_dict.items():
1571
1590
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1572
1591
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1730,11 +1749,15 @@ class FeaturesEnricher(TransformerMixin):
1730
1749
  if eval_set is not None
1731
1750
  else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
1732
1751
  )
1752
+
1753
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
1754
+ # Sample after sorting by system_record_id for idempotency
1755
+ df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
1756
+
1733
1757
  if num_samples > sample_threshold:
1734
1758
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1735
1759
  df = df.sample(n=sample_rows, random_state=self.random_state)
1736
1760
 
1737
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1738
1761
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1739
1762
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1740
1763
 
@@ -1873,15 +1896,12 @@ class FeaturesEnricher(TransformerMixin):
1873
1896
 
1874
1897
  # downsample if need to eval_set threshold
1875
1898
  num_samples = _num_samples(df)
1876
- phone_column = self._get_phone_column(self.search_keys)
1877
1899
  force_downsampling = (
1878
1900
  not self.disable_force_downsampling
1879
- and self.generate_features is not None
1880
- and phone_column is not None
1881
- and self.fit_columns_renaming is not None
1882
- and self.fit_columns_renaming.get(phone_column) in self.generate_features
1901
+ and self.columns_for_online_api is not None
1883
1902
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1884
1903
  )
1904
+ # TODO: check that system_record_id was added before this step
1885
1905
  if force_downsampling:
1886
1906
  self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1887
1907
  df = balance_undersample_forced(
@@ -1915,6 +1935,7 @@ class FeaturesEnricher(TransformerMixin):
1915
1935
  progress_bar=progress_bar,
1916
1936
  progress_callback=progress_callback,
1917
1937
  add_fit_system_record_id=True,
1938
+ target_name=tmp_target_name,
1918
1939
  )
1919
1940
  if enriched_df is None:
1920
1941
  return None
@@ -1948,7 +1969,28 @@ class FeaturesEnricher(TransformerMixin):
1948
1969
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1949
1970
 
1950
1971
  num_samples = _num_samples(df)
1951
- if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1972
+ force_downsampling = (
1973
+ not self.disable_force_downsampling
1974
+ and self.columns_for_online_api is not None
1975
+ and num_samples > Dataset.FORCE_SAMPLE_SIZE
1976
+ )
1977
+
1978
+ if force_downsampling:
1979
+ self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1980
+ df = balance_undersample_forced(
1981
+ df=df,
1982
+ target_column=TARGET,
1983
+ id_columns=self.id_columns,
1984
+ date_column=self._get_date_column(self.search_keys),
1985
+ task_type=self.model_task_type,
1986
+ cv_type=self.cv,
1987
+ random_state=self.random_state,
1988
+ sample_size=Dataset.FORCE_SAMPLE_SIZE,
1989
+ logger=self.logger,
1990
+ bundle=self.bundle,
1991
+ warning_callback=self.__log_warning,
1992
+ )
1993
+ elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1952
1994
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
1953
1995
  df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
1954
1996
 
@@ -1964,6 +2006,7 @@ class FeaturesEnricher(TransformerMixin):
1964
2006
  progress_bar=progress_bar,
1965
2007
  progress_callback=progress_callback,
1966
2008
  add_fit_system_record_id=True,
2009
+ target_name=tmp_target_name,
1967
2010
  )
1968
2011
  if enriched_Xy is None:
1969
2012
  return None
@@ -2125,6 +2168,7 @@ if response.status_code == 200:
2125
2168
  progress_bar: Optional[ProgressBar] = None,
2126
2169
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2127
2170
  add_fit_system_record_id: bool = False,
2171
+ target_name: Optional[str] = None,
2128
2172
  ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2129
2173
  if self._search_task is None:
2130
2174
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2309,8 +2353,16 @@ if response.status_code == 200:
2309
2353
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2310
2354
  ]
2311
2355
 
2312
- if add_fit_system_record_id:
2313
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2356
+ if add_fit_system_record_id and target_name is not None:
2357
+ reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
2358
+ df = self.__add_fit_system_record_id(
2359
+ df,
2360
+ search_keys,
2361
+ SYSTEM_RECORD_ID,
2362
+ reversed_columns_renaming.get(target_name, target_name),
2363
+ columns_renaming,
2364
+ silent=True,
2365
+ )
2314
2366
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2315
2367
  features_not_to_pass.append(SORT_ID)
2316
2368
 
@@ -2620,17 +2672,18 @@ if response.status_code == 200:
2620
2672
  checked_generate_features = []
2621
2673
  for gen_feature in self.generate_features:
2622
2674
  if gen_feature not in x_columns:
2623
- if gen_feature == self._get_phone_column(self.search_keys):
2624
- raise ValidationError(
2625
- self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2626
- )
2627
- else:
2628
- self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2675
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2676
+ self.__log_warning(msg)
2629
2677
  else:
2630
2678
  checked_generate_features.append(gen_feature)
2631
2679
  self.generate_features = checked_generate_features
2632
2680
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2633
2681
 
2682
+ if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
2683
+ for column in self.columns_for_online_api:
2684
+ if column not in validated_X.columns:
2685
+ raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
2686
+
2634
2687
  if self.id_columns is not None:
2635
2688
  for id_column in self.id_columns:
2636
2689
  if id_column not in validated_X.columns:
@@ -2754,7 +2807,9 @@ if response.status_code == 200:
2754
2807
  self.__log_warning(full_duplicates_warning)
2755
2808
 
2756
2809
  # Explode multiple search keys
2757
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2810
+ df = self.__add_fit_system_record_id(
2811
+ df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
2812
+ )
2758
2813
 
2759
2814
  # TODO check that this is correct for enrichment
2760
2815
  self.df_with_original_index = df.copy()
@@ -2836,7 +2891,9 @@ if response.status_code == 200:
2836
2891
  if eval_set is not None and len(eval_set) > 0:
2837
2892
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2838
2893
 
2839
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2894
+ df = self.__add_fit_system_record_id(
2895
+ df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
2896
+ )
2840
2897
 
2841
2898
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2842
2899
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -2852,9 +2909,7 @@ if response.status_code == 200:
2852
2909
  # Force downsampling to 7000 for API features generation
2853
2910
  force_downsampling = (
2854
2911
  not self.disable_force_downsampling
2855
- and self.generate_features is not None
2856
- and phone_column is not None
2857
- and self.fit_columns_renaming[phone_column] in self.generate_features
2912
+ and self.columns_for_online_api is not None
2858
2913
  and len(df) > Dataset.FORCE_SAMPLE_SIZE
2859
2914
  )
2860
2915
  if force_downsampling:
@@ -3525,56 +3580,82 @@ if response.status_code == 200:
3525
3580
  def __add_fit_system_record_id(
3526
3581
  self,
3527
3582
  df: pd.DataFrame,
3528
- # meaning_types: Dict[str, FileColumnMeaningType],
3529
3583
  search_keys: Dict[str, SearchKey],
3530
3584
  id_name: str,
3585
+ target_name: str,
3586
+ columns_renaming: Dict[str, str],
3587
+ silent: bool = False,
3531
3588
  ) -> pd.DataFrame:
3532
- # save original order or rows
3533
3589
  original_index_name = df.index.name
3534
3590
  index_name = df.index.name or DEFAULT_INDEX
3535
3591
  original_order_name = "original_order"
3592
+ # Save original index
3536
3593
  df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
3594
+ # Save original order
3537
3595
  df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
3538
3596
 
3539
- # order by date and idempotent order by other keys
3540
- if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3541
- sort_exclude_columns = [
3542
- original_order_name,
3543
- ORIGINAL_INDEX,
3544
- EVAL_SET_INDEX,
3545
- TARGET,
3546
- "__target",
3547
- ENTITY_SYSTEM_RECORD_ID,
3548
- ]
3549
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3550
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3551
- sort_exclude_columns.append(self._get_date_column(search_keys))
3552
- else:
3553
- date_column = self._get_date_column(search_keys)
3554
- sort_columns = [date_column] if date_column is not None else []
3597
+ # order by date and idempotent order by other keys and features
3555
3598
 
3556
- sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3557
- sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3599
+ sort_exclude_columns = [
3600
+ original_order_name,
3601
+ ORIGINAL_INDEX,
3602
+ EVAL_SET_INDEX,
3603
+ TARGET,
3604
+ "__target",
3605
+ ENTITY_SYSTEM_RECORD_ID,
3606
+ ]
3607
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3608
+ date_column = DateTimeSearchKeyConverter.DATETIME_COL
3609
+ sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
3610
+ else:
3611
+ date_column = FeaturesEnricher._get_date_column(search_keys)
3612
+ sort_exclude_columns.append(date_column)
3613
+ columns_to_sort = [date_column] if date_column is not None else []
3614
+
3615
+ do_sorting = True
3616
+ if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
3617
+ # Check duplicates by date and id_columns
3618
+ reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
3619
+ renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
3620
+ duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
3621
+ if date_column is not None:
3622
+ duplicate_check_columns.append(date_column)
3558
3623
 
3559
- other_columns = sorted(
3560
- [
3561
- c
3562
- for c in df.columns
3563
- if c not in sort_columns
3564
- and c not in sorted_other_keys
3565
- and c not in sort_exclude_columns
3566
- and df[c].nunique() > 1
3567
- ]
3624
+ duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3625
+ if duplicates.any():
3626
+ if not silent:
3627
+ self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3628
+ else:
3629
+ self.logger.warning(
3630
+ f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
3631
+ " Will not sort dataset"
3632
+ )
3633
+ do_sorting = False
3634
+ else:
3635
+ columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3636
+ columns_to_hash = sort_columns(
3637
+ df[columns_to_hash],
3638
+ target_name,
3639
+ search_keys,
3640
+ self.model_task_type,
3641
+ sort_exclude_columns,
3642
+ logger=self.logger,
3643
+ )
3644
+ else:
3645
+ columns_to_hash = sort_columns(
3646
+ df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
3568
3647
  )
3569
-
3570
- all_other_columns = sorted_other_keys + other_columns
3571
-
3648
+ if do_sorting:
3572
3649
  search_keys_hash = "search_keys_hash"
3573
- if len(all_other_columns) > 0:
3574
- sort_columns.append(search_keys_hash)
3575
- df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3576
-
3577
- df = df.sort_values(by=sort_columns)
3650
+ if len(columns_to_hash) > 0:
3651
+ factorized_df = df.copy()
3652
+ for col in columns_to_hash:
3653
+ if col not in search_keys and not is_numeric_dtype(factorized_df[col]):
3654
+ factorized_df[col] = factorized_df[col].factorize(sort=True)[0]
3655
+ df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
3656
+ columns_to_sort.append(search_keys_hash)
3657
+
3658
+ df = df.sort_values(by=columns_to_sort)
3578
3659
 
3579
3660
  if search_keys_hash in df.columns:
3580
3661
  df.drop(columns=search_keys_hash, inplace=True)
@@ -30,8 +30,8 @@ except ImportError:
30
30
  from sklearn.metrics._regression import (
31
31
  _check_reg_targets,
32
32
  check_consistent_length,
33
- mean_squared_error,
34
33
  )
34
+ from sklearn.metrics import mean_squared_error
35
35
  from sklearn.model_selection import BaseCrossValidator
36
36
 
37
37
  from upgini.errors import ValidationError
@@ -289,9 +289,6 @@ class EstimatorWrapper:
289
289
  else:
290
290
  x, y = self._remove_empty_target_rows(x, y)
291
291
 
292
- # Make order of columns idempotent
293
- x = x[sorted(x.columns)]
294
-
295
292
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
296
293
  return x, y, groups
297
294
 
@@ -569,7 +566,7 @@ class CatBoostWrapper(EstimatorWrapper):
569
566
  if all([isinstance(c, int) for c in estimator_cat_features]):
570
567
  cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
571
568
  cat_features_idx.update(estimator_cat_features)
572
- self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
569
+ self.cat_features = [x.columns[idx] for idx in cat_features_idx]
573
570
  elif all([isinstance(c, str) for c in estimator_cat_features]):
574
571
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
575
572
  else:
@@ -940,13 +937,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
940
937
  if (y_true < 0).any():
941
938
  raise ValidationError(bundle.get("metrics_msle_negative_target"))
942
939
 
943
- return mean_squared_error(
940
+ mse = mean_squared_error(
944
941
  log1p(y_true),
945
942
  log1p(y_pred.clip(0)),
946
943
  sample_weight=sample_weight,
947
944
  multioutput=multioutput,
948
- squared=squared,
949
945
  )
946
+ return mse if squared else np.sqrt(mse)
950
947
 
951
948
 
952
949
  def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
35
35
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
36
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
37
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
+ date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
38
39
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
39
40
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
40
41
  # Errors
@@ -111,6 +112,7 @@ x_is_empty=X is empty
111
112
  y_is_empty=y is empty
112
113
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
114
  missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
115
+ missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
114
116
  x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
117
  train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
118
  eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
@@ -166,6 +166,8 @@ class DateTimeSearchKeyConverter:
166
166
 
167
167
  # Drop intermediate columns if not needed
168
168
  df.drop(columns=["second", "minute", "hour"], inplace=True)
169
+ else:
170
+ keep_time = False
169
171
 
170
172
  for generated_feature in self.generated_features[:]:
171
173
  if df[generated_feature].dropna().nunique() <= 1:
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
116
116
  else:
117
117
  df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
118
118
 
119
- del self.search_keys[self.email_column]
120
- if self.email_column in self.unnest_search_keys:
121
- self.unnest_search_keys.remove(self.email_column)
119
+ # del self.search_keys[self.email_column]
120
+ # if self.email_column in self.unnest_search_keys:
121
+ # self.unnest_search_keys.remove(self.email_column)
122
122
 
123
123
  one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
124
  df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
125
  self.columns_renaming[one_domain_name] = original_email_column
126
126
  self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
127
127
 
128
- if self.email_converted_to_hem:
129
- df = df.drop(columns=self.email_column)
130
- del self.columns_renaming[self.email_column]
128
+ # if self.email_converted_to_hem:
129
+ # df = df.drop(columns=self.email_column)
130
+ # del self.columns_renaming[self.email_column]
131
131
 
132
132
  return df
@@ -0,0 +1,177 @@
1
+ import warnings
2
+ from collections import namedtuple
3
+
4
+ import numpy as np
5
+ import numpy.ma as ma
6
+ import scipy
7
+ from joblib import Parallel, delayed
8
+ from numpy import ndarray
9
+ from psutil import cpu_count
10
+
11
# Executed at import time: tell NumPy to ignore divide-by-zero floating-point errors.
# NOTE(review): this is not scoped to this module's functions; code that imports
# this module inherits the setting as well - confirm that is intended.
np.seterr(divide="ignore")


# Executed at import time: install a warnings filter that ignores every RuntimeWarning.
# NOTE(review): the filter is added to the interpreter-wide filter list, so
# RuntimeWarnings raised by unrelated code are hidden too.
warnings.simplefilter(action="ignore", category=RuntimeWarning)
15
+
16
+
17
def _find_repeats(arr):
    """Return the values that occur more than once in ``arr`` and their counts.

    The input is converted to a flat float64 array and sorted. When that
    conversion yields a view of the input, the caller's data is sorted in
    place, so pass a copy if the original order matters.

    Returns:
        A ``(values, counts)`` pair of 1-D arrays: the repeated values in
        ascending order and the number of occurrences of each. Both arrays are
        empty when the input is empty or when no value repeats.
    """
    if len(arr) == 0:
        # Empty 1-D arrays (not 0-d scalars) so callers can iterate over the
        # result exactly as they do in the "no repeats" case.
        return np.empty(0, dtype=np.float64), np.empty(0, dtype=np.intp)

    # XXX This cast was previously needed for the Fortran implementation,
    # should we ditch it?
    arr = np.asarray(arr, np.float64).ravel()
    arr.sort()

    # Taken from NumPy 1.9's np.unique.
    # A run of equal values starts wherever an element differs from its predecessor.
    change = np.concatenate(([True], arr[1:] != arr[:-1]))
    unique = arr[change]
    # Start index of every run plus the total size, so np.diff yields run lengths.
    change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
    freq = np.diff(change_idx)
    atleast2 = freq > 1
    return unique[atleast2], freq[atleast2]
34
+
35
+
36
def find_repeats(arr):
    """Find repeated values among the unmasked entries of ``arr``.

    The unmasked data is extracted as a float64 array and handed to
    ``_find_repeats``, which may sort its argument in place; a private copy is
    made whenever the extracted data could share memory with ``arr``.

    Returns:
        Whatever ``_find_repeats`` returns for the unmasked data: a pair of
        (repeated values, occurrence counts).
    """
    values = np.asarray(ma.compressed(arr), dtype=np.float64)
    # ma.compressed promises a "new array" but can actually return a reference.
    try:
        shares_memory = np.may_share_memory(values, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        shares_memory = False
    return _find_repeats(values.copy() if shares_memory else values)
49
+
50
+
51
def rankdata(data, axis=None, use_missing=False):
    """Rank the values of a (possibly masked) array, averaging the ranks of ties.

    Args:
        data: Array-like input; it is wrapped in a masked array without copying.
        axis: Axis along which to rank. With ``None`` the data is ranked as one
            flattened sequence and the ranks are reshaped to the input shape.
        use_missing: Rank given to the entries that sort after the ``n`` counted
            (unmasked) ones: the average rank ``(n + 1) / 2`` when true,
            ``0`` otherwise.

    Returns:
        A float array of ranks starting at 1. When ``axis`` is given the result
        is viewed as a plain ``ndarray``.
    """

    def _rank1d(values, use_missing=False):
        valid_count = values.count()
        order = values.argsort()
        ranks = np.empty(values.size, dtype=float)
        # The first `valid_count` positions of the sort order get ranks 1..n;
        # everything after them gets the placeholder rank.
        ranks[order[:valid_count]] = np.arange(1, valid_count + 1)
        ranks[order[valid_count:]] = (valid_count + 1) / 2.0 if use_missing else 0

        # Replace the ranks of each group of equal values by the group's mean rank.
        tied_values = find_repeats(values.copy())[0]
        for tied in tied_values:
            same_value = (values == tied).filled(False)
            ranks[same_value] = ranks[same_value].mean()
        return ranks

    data = ma.array(data, copy=False)
    if axis is not None:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
    if data.ndim > 1:
        return _rank1d(data.ravel(), use_missing).reshape(data.shape)
    return _rank1d(data, use_missing)
77
+
78
+
79
def _chk_asarray(a, axis):
    """Convert ``a`` to a masked array and normalise the axis argument.

    Returns:
        ``(array, outaxis)``. With ``axis=None`` the array is flattened and
        ``outaxis`` is 0; otherwise the array keeps its shape and ``outaxis``
        is ``axis`` unchanged.
    """
    masked = ma.asanyarray(a)
    if axis is not None:
        return masked, axis
    return ma.ravel(masked), 0
88
+
89
+
90
# Result container returned by spearmanr(): the correlation coefficient(s)
# and the matching p-value(s), accessible by name or by position.
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
91
+
92
+
93
+ # Taken from scipy.mstats with following tweaks:
94
+ # 1. parallel pairwise computation
95
+ # 2. custom masking
96
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Compute Spearman rank correlations and p-values for masked data.

    Every pair of variables is processed independently, so observations that
    are masked in a third variable do not affect the result for a pair. With
    more than two variables the pairs are evaluated in parallel through joblib.

    Args:
        x: Observations; converted to a masked array (flattened when
            ``axis`` is None).
        y: Optional second set of observations, stacked onto ``x`` as extra
            variables.
        use_ties: Must be true; ``False`` is rejected.
        axis: ``None`` or ``0`` treats columns as variables; ``1`` treats rows
            as variables (the data is transposed internally).
        nan_policy: Only ``"omit"`` is acted upon here: ``mask_fn`` is applied
            to the stacked data. Any other value leaves the data unchanged.
        alternative: Passed through to SciPy's t-test finishing helper.
        mask_fn: Callable that masks the values to omit; defaults to
            ``numpy.ma.masked_invalid``.

    Returns:
        ``SpearmanrResult(correlation, pvalue)``. For exactly two variables both
        fields are scalars (NaN when no non-zero data is left after dropping
        masked rows, or when the degrees of freedom are negative). Otherwise both
        are ``(n_vars, n_vars)`` matrices with ones (correlation) and zeros
        (p-value) on the diagonal.

    Raises:
        ValueError: If ``use_ties`` is false.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follow (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)

    # One shared list of (var1, var2) index pairs drives both the dispatch and
    # the matrix fill, so results line up by construction and no O(n) list
    # pops are needed to consume them.
    pairs = [(var1, var2) for var1 in range(n_vars - 1) for var2 in range(var1 + 1, n_vars)]

    max_cpu_cores = cpu_count(logical=False)
    with np.errstate(divide="ignore"):
        results = Parallel(n_jobs=max_cpu_cores)(
            delayed(_spearmanr_2cols)(x[:, [var1, var2]]) for var1, var2 in pairs
        )

    rs = np.ones((n_vars, n_vars), dtype=float)
    prob = np.zeros((n_vars, n_vars), dtype=float)
    for (var1, var2), result in zip(pairs, results):
        # Both matrices are symmetric: mirror every pair across the diagonal.
        rs[var1, var2] = result.correlation
        rs[var2, var1] = result.correlation
        prob[var1, var2] = result.pvalue
        prob[var2, var1] = result.pvalue

    return SpearmanrResult(rs, prob)
@@ -0,0 +1,172 @@
1
+ import hashlib
2
+ import logging
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from joblib import Parallel, delayed
8
+ from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
9
+ from psutil import cpu_count
10
+ from scipy.stats import skew, spearmanr
11
+
12
+ from upgini.metadata import ModelTaskType, SearchKey
13
+ from upgini.utils import mstats
14
+
15
+
16
def sort_columns(
    df: pd.DataFrame,
    target_column: Union[str, pd.Series],
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: Optional[List[str]] = None,
    sort_all_columns: bool = False,
    logger: Optional[logging.Logger] = None,
) -> List[str]:
    """Return the columns of ``df`` in a deterministic, relevance-based order.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey`` value. The remaining columns follow, ordered by descending
    ``(correlation with target, content hash)`` as computed by
    ``get_sort_columns_dict``.

    Args:
        df: Input frame; it is copied, so the caller's frame is not modified.
        target_column: Name of the target column in ``df`` or the target itself.
        search_keys: Mapping of column name to search key type.
        model_task_type: Task type used to prepare the target.
        exclude_columns: Columns to leave out of the result entirely.
        sort_all_columns: Keep columns with at most one distinct value, which
            are otherwise dropped.
        logger: Logger for diagnostics. When omitted, this module's logger is
            used with its level raised to FATAL, i.e. diagnostics are silenced.

    Returns:
        Ordered list of column names. Non-key columns for which no sort entry
        could be computed are omitted.
    """
    if exclude_columns is None:
        exclude_columns = []
    if logger is None:
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.FATAL)
    df = df.copy()  # avoid side effects

    # Check multiple search keys
    search_key_values = list(search_keys.values())
    has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
    if has_duplicate_search_keys:
        # Report through the resolved logger, not the root `logging` module,
        # so the caller's logger (or the silent default) is honoured.
        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")

    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]

    other_columns = sorted(
        [
            c
            for c in df.columns
            if c not in sorted_keys and c not in exclude_columns and (df[c].nunique() > 1 or sort_all_columns)
        ]
    )
    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
    target = prepare_target(target, model_task_type)
    sort_dict = get_sort_columns_dict(
        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
    )
    # Columns without a sort entry (e.g. string features that were not factorized) are dropped.
    other_columns = [c for c in other_columns if c in sort_dict]
    columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
    return columns_for_sort
56
+
57
+
58
def get_sort_columns_dict(
    df: pd.DataFrame,
    target: pd.Series,
    sorted_keys: List[str],
    omit_nan: bool,
    n_jobs: Optional[int] = None,
    sort_all_columns: bool = False,
) -> Dict[str, Any]:
    """Compute a sort key for every sortable column of ``df``.

    Numeric columns outside ``sorted_keys`` are always sortable. Non-numeric
    columns are factorized into integer codes and included only when
    ``sort_all_columns`` is set or when the frame without them contains
    duplicate rows. The caller's frame is never modified.

    Args:
        df: Feature frame.
        target: Prepared target aligned with ``df``.
        sorted_keys: Columns that are ordered elsewhere and skipped here.
        omit_nan: Forwarded to the correlation computation.
        n_jobs: Forwarded to the correlation computation.
        sort_all_columns: Force factorization of the non-numeric columns.

    Returns:
        Mapping ``column -> (correlation, hash)``; empty when nothing is sortable.
    """
    string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
    # Build the exclusion list once instead of concatenating it for every column.
    excluded = sorted_keys + string_features
    columns_for_sort = [c for c in df.columns if c not in excluded]
    if len(string_features) > 0:
        # The duplicate scan is only needed when factorization is not already forced.
        if sort_all_columns or len(df) > len(df.drop(columns=string_features).drop_duplicates()):
            # Work on a private copy: the caller's frame must keep its original values.
            df = df.copy()
            # factorize string features
            for c in string_features:
                df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
            columns_for_sort.extend(string_features)

    if len(columns_for_sort) == 0:
        return {}

    df = df[columns_for_sort]
    # Hash the column contents before the float32 conversion below.
    hashes = [hash_series(df[col]) for col in columns_for_sort]
    df = np.asarray(df, dtype=np.float32)
    correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)

    sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
    return sort_dict
85
+
86
+
87
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
    """Return one correlation score per column of ``df``.

    The score is the column-wise maximum over the rows returned by
    ``get_target_correlations`` (computed with 7 digits of precision).
    """
    per_method_scores = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    best_scores = np.max(per_method_scores, axis=0)
    return best_scores
91
+
92
+
93
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
):
    """Compute Spearman and Pearson correlations of each column with the target.

    Args:
        df: Feature matrix; converted to float32, columns are the variables.
        target: Target values.
        omit_nan: Forwarded to ``calculate_spearman_corr_with_target``.
        n_jobs: Forwarded to ``calculate_spearman_corr_with_target``.
        precision: Number of decimal digits kept; further digits are truncated.

    Returns:
        Array of shape ``(2, n_columns)``: row 0 holds the values from
        ``calculate_spearman_corr_with_target``, row 1 the absolute Pearson
        correlations. NaN entries are replaced by 0.
    """
    df = np.asarray(df, dtype=np.float32)
    result = np.zeros((2, df.shape[1]))

    spearman_scores = calculate_spearman_corr_with_target(df, target, omit_nan, n_jobs)
    result[0, :] = np.nan_to_num(spearman_scores, copy=False)

    # The last row of the stacked correlation matrix belongs to the target;
    # its final entry (target vs. itself) is dropped.
    pearson_matrix = np.corrcoef(df.T, target.T, rowvar=True)
    result[1, :] = np.nan_to_num(np.abs(pearson_matrix[-1, :-1]))

    scale = 10**precision
    return np.trunc(result * scale) / scale
106
+
107
+
108
def calculate_spearman_corr_with_target(
    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
) -> np.ndarray:
    """Return the absolute Spearman correlation of every column of ``X`` with ``y``.

    Args:
        X: Feature matrix; a DataFrame is converted to a float32 array.
        y: Target values.
        omit_nan: When true, each column is correlated separately (in parallel)
            with NaN handling delegated to ``mstats.spearmanr(nan_policy="omit")``.
            When false, columns containing NaN are skipped and the remaining
            ones are correlated in a single call.
        n_jobs: Number of parallel workers for the ``omit_nan`` path; defaults
            to the physical core count.

    Returns:
        Array with one entry per column. Columns that were not evaluated
        (constant columns, or NaN-containing ones when ``omit_nan`` is false)
        hold NaN. An empty array of shape ``(0,)`` is returned for empty input.
    """
    if isinstance(X, pd.DataFrame):
        X = np.asarray(X, dtype=np.float32)

    if X.size == 0:
        return np.empty((0,))

    # Only columns with at least two distinct values are worth evaluating.
    is_varying = [col.size > 0 and not (col == col[0]).all() for col in X.T]
    cols2calc = np.where(is_varying)[0]

    if not omit_nan:
        nan_free = ~np.isnan(X[:, cols2calc]).any(axis=0)
        cols2calc = cols2calc[np.where(nan_free)[0]]
        raw = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
        if isinstance(raw, float):
            # A single scalar comes back when only one pair was correlated.
            target_correlations = np.abs([raw])
        else:
            # Last row = target vs. each feature; drop target vs. itself.
            target_correlations = np.abs(raw)[-1, :-1]
    else:
        workers = n_jobs or cpu_count(logical=False)
        results = Parallel(n_jobs=workers)(
            delayed(mstats.spearmanr)(
                X[:, i],
                y,
                nan_policy="omit",
                axis=0,
            )
            for i in cols2calc
        )
        target_correlations = np.array([abs(res.correlation) for res in results])

    all_correlations = np.full(X.shape[1], np.nan)
    all_correlations[cols2calc] = target_correlations
    return all_correlations
143
+
144
+
145
def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
    """Return the Spearman correlation of the columns of ``X`` (and ``y``, if given).

    When fewer than two variables are available in total, ``1.0`` is returned
    without calling SciPy; otherwise the ``correlation`` attribute of
    ``scipy.stats.spearmanr`` is returned as-is.
    """
    variables_total = X.shape[1] + (0 if y is None else 1)
    if variables_total >= 2:
        return spearmanr(X, y, nan_policy=nan_policy).correlation
    return 1.0
154
+
155
+
156
def hash_series(series: pd.Series) -> int:
    """Return a SHA-256 based integer fingerprint of a series.

    The per-row pandas hashes (index included) are fed to SHA-256 and the
    digest is interpreted as one big-endian unsigned integer.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    return int.from_bytes(digest, "big")
158
+
159
+
160
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Convert the target into a numeric series suitable for correlation.

    For non-regression tasks, and for targets that are neither numeric nor
    datetime, the values are cast to strings and replaced by their category
    codes. For the remaining (regression) targets, ``log1p`` is applied when
    the minimum is non-negative and the absolute skewness, rounded to two
    decimals, is at least 0.9.

    Returns:
        A series carrying the original target name.
    """
    original_name = target.name
    needs_encoding = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )

    if needs_encoding:
        as_categories = target.astype(str).astype("category")
        target = as_categories.cat.codes
    else:
        skewness = round(abs(skew(target)), 2)
        # Compress heavily skewed, non-negative targets.
        if (target.min() >= 0) and (skewness >= 0.9):
            target = np.log1p(target)

    return pd.Series(target, name=original_name)
@@ -1,4 +1,3 @@
1
- import itertools
2
1
  import logging
3
2
  from typing import Callable, List, Optional, Union
4
3
 
@@ -207,7 +206,7 @@ def balance_undersample_forced(
207
206
  id_columns: List[str],
208
207
  date_column: str,
209
208
  task_type: ModelTaskType,
210
- cv_type: CVType | None,
209
+ cv_type: Optional[CVType],
211
210
  random_state: int,
212
211
  sample_size: int = 7000,
213
212
  logger: Optional[logging.Logger] = None,
@@ -319,7 +318,8 @@ def balance_undersample_time_series(
319
318
  if len(id_counts) < min_different_ids:
320
319
  if logger is not None:
321
320
  logger.info(
322
- f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
321
+ f"Different ids count {len(id_counts)} for sample size {sample_size}"
322
+ f" is less than min different ids {min_different_ids}, sampling time window"
323
323
  )
324
324
  date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
325
325
  ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
@@ -1 +0,0 @@
1
- __version__ = "1.2.59a3818.dev1"
File without changes
File without changes
File without changes