upgini 1.2.60a3792.dev2__tar.gz → 1.2.62__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/PKG-INFO +2 -1
  2. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/pyproject.toml +1 -0
  3. upgini-1.2.62/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/date.py +1 -1
  5. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/data_source/data_source_publisher.py +9 -4
  6. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/features_enricher.py +107 -45
  7. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/metrics.py +4 -7
  8. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/resource_bundle/strings.properties +1 -0
  9. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/datetime_utils.py +2 -0
  10. upgini-1.2.62/src/upgini/utils/mstats.py +177 -0
  11. upgini-1.2.62/src/upgini/utils/sort.py +172 -0
  12. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/target_utils.py +3 -3
  13. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/ts_utils.py +0 -6
  14. upgini-1.2.60a3792.dev2/src/upgini/__about__.py +0 -1
  15. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/.gitignore +0 -0
  16. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/LICENSE +0 -0
  17. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/README.md +0 -0
  18. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/__init__.py +0 -0
  19. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/ads.py +0 -0
  20. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/ads_management/__init__.py +0 -0
  21. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/ads_management/ads_manager.py +0 -0
  22. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/__init__.py +0 -0
  23. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/all_operands.py +0 -0
  24. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/binary.py +0 -0
  25. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/feature.py +0 -0
  26. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/groupby.py +0 -0
  27. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/operand.py +0 -0
  28. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/unary.py +0 -0
  29. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/autofe/vector.py +0 -0
  30. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/dataset.py +0 -0
  32. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/errors.py +0 -0
  33. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/http.py +0 -0
  34. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/lazy_import.py +0 -0
  35. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/metadata.py +0 -0
  38. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/normalizer/__init__.py +0 -0
  39. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/normalizer/normalize_utils.py +0 -0
  40. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/search_task.py +0 -0
  48. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/spinner.py +0 -0
  49. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  50. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/__init__.py +0 -0
  51. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/base_search_key_detector.py +0 -0
  52. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/blocked_time_series.py +0 -0
  53. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/country_utils.py +0 -0
  54. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/custom_loss_utils.py +0 -0
  55. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/cv_utils.py +0 -0
  56. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/deduplicate_utils.py +0 -0
  57. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/display_utils.py +0 -0
  58. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/email_utils.py +0 -0
  59. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/fallback_progress_bar.py +0 -0
  60. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/feature_info.py +0 -0
  61. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/features_validator.py +0 -0
  62. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/format.py +0 -0
  63. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/ip_utils.py +0 -0
  64. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/phone_utils.py +0 -0
  65. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/postal_code_utils.py +0 -0
  66. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/progress_bar.py +0 -0
  67. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/sklearn_ext.py +0 -0
  68. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/track_info.py +0 -0
  69. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/utils/warning_counter.py +0 -0
  70. {upgini-1.2.60a3792.dev2 → upgini-1.2.62}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.60a3792.dev2
3
+ Version: 1.2.62
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: jarowinkler>=2.0.0
30
30
  Requires-Dist: levenshtein>=0.25.1
31
31
  Requires-Dist: numpy<=1.26.4,>=1.19.0
32
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
33
+ Requires-Dist: psutil>=6.0.0
33
34
  Requires-Dist: pydantic<3.0.0,>1.0.0
34
35
  Requires-Dist: pyjwt>=2.8.0
35
36
  Requires-Dist: python-bidi==0.4.2
@@ -50,6 +50,7 @@ dependencies = [
50
50
  "xhtml2pdf>=0.2.11,<0.3.0",
51
51
  "jarowinkler>=2.0.0",
52
52
  "levenshtein>=0.25.1",
53
+ "psutil>=6.0.0",
53
54
  ]
54
55
 
55
56
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.62"
@@ -1,6 +1,6 @@
1
1
  import abc
2
2
  import json
3
- from typing import Any, Dict, List, Optional, Union
3
+ from typing import Dict, List, Optional, Union
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -63,6 +63,7 @@ class DataSourcePublisher:
63
63
  keep_features: Optional[List[str]] = None,
64
64
  date_features: Optional[List[str]] = None,
65
65
  date_vector_features: Optional[List[str]] = None,
66
+ date_features_format: Optional[str] = None,
66
67
  generate_runtime_embeddings: Optional[List[str]] = None,
67
68
  exclude_raw: Optional[List[str]] = None,
68
69
  _force_generation=False,
@@ -160,13 +161,17 @@ class DataSourcePublisher:
160
161
  if keep_features is not None:
161
162
  request["keepFeatures"] = keep_features
162
163
  if date_features is not None:
163
- if date_format is None:
164
- raise ValidationError("date_format should be presented if you use date features")
164
+ if date_features_format is None:
165
+ raise ValidationError("date_features_format should be presented if you use date features")
165
166
  request["dateFeatures"] = date_features
167
+ request["dateFeaturesFormat"] = date_features_format
166
168
  if date_vector_features is not None:
167
- if date_format is None:
168
- raise ValidationError("date_format should be presented if you use date vector features")
169
+ if date_features_format is None:
170
+ raise ValidationError(
171
+ "date_features_format should be presented if you use date vector features"
172
+ )
169
173
  request["dateVectorFeatures"] = date_vector_features
174
+ request["dateFeaturesFormat"] = date_features_format
170
175
  if generate_runtime_embeddings is not None:
171
176
  request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
172
177
  if exclude_raw is not None:
@@ -112,6 +112,7 @@ try:
112
112
  except Exception:
113
113
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
114
114
 
115
+ from upgini.utils.sort import sort_columns
115
116
  from upgini.utils.target_utils import (
116
117
  balance_undersample_forced,
117
118
  calculate_psi,
@@ -1261,7 +1262,7 @@ class FeaturesEnricher(TransformerMixin):
1261
1262
  for feature, shap in new_shaps.items()
1262
1263
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1263
1264
  }
1264
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1265
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1265
1266
 
1266
1267
  if self.features_info_display_handle is not None:
1267
1268
  try:
@@ -1568,9 +1569,23 @@ class FeaturesEnricher(TransformerMixin):
1568
1569
 
1569
1570
  fitting_eval_set_dict = {}
1570
1571
  fitting_x_columns = fitting_X.columns.to_list()
1571
- self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
1572
+ # Idempotently sort columns
1573
+ fitting_x_columns = sort_columns(
1574
+ fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
1575
+ )
1576
+ fitting_X = fitting_X[fitting_x_columns]
1577
+ self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
1572
1578
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
1573
- self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
1579
+ fitting_enriched_x_columns = sort_columns(
1580
+ fitting_enriched_X,
1581
+ enriched_y_sorted,
1582
+ search_keys,
1583
+ self.model_task_type,
1584
+ sort_all_columns=True,
1585
+ logger=self.logger,
1586
+ )
1587
+ fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
1588
+ self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
1574
1589
  for idx, eval_tuple in eval_set_sampled_dict.items():
1575
1590
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1576
1591
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1734,11 +1749,15 @@ class FeaturesEnricher(TransformerMixin):
1734
1749
  if eval_set is not None
1735
1750
  else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
1736
1751
  )
1752
+
1753
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
1754
+ # Sample after sorting by system_record_id for idempotency
1755
+ df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
1756
+
1737
1757
  if num_samples > sample_threshold:
1738
1758
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1739
1759
  df = df.sample(n=sample_rows, random_state=self.random_state)
1740
1760
 
1741
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1742
1761
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1743
1762
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1744
1763
 
@@ -1882,6 +1901,7 @@ class FeaturesEnricher(TransformerMixin):
1882
1901
  and self.columns_for_online_api is not None
1883
1902
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1884
1903
  )
1904
+ # TODO: check that system_record_id was added before this step
1885
1905
  if force_downsampling:
1886
1906
  self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1887
1907
  df = balance_undersample_forced(
@@ -1915,6 +1935,7 @@ class FeaturesEnricher(TransformerMixin):
1915
1935
  progress_bar=progress_bar,
1916
1936
  progress_callback=progress_callback,
1917
1937
  add_fit_system_record_id=True,
1938
+ target_name=tmp_target_name,
1918
1939
  )
1919
1940
  if enriched_df is None:
1920
1941
  return None
@@ -1953,6 +1974,7 @@ class FeaturesEnricher(TransformerMixin):
1953
1974
  and self.columns_for_online_api is not None
1954
1975
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1955
1976
  )
1977
+
1956
1978
  if force_downsampling:
1957
1979
  self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1958
1980
  df = balance_undersample_forced(
@@ -1984,6 +2006,7 @@ class FeaturesEnricher(TransformerMixin):
1984
2006
  progress_bar=progress_bar,
1985
2007
  progress_callback=progress_callback,
1986
2008
  add_fit_system_record_id=True,
2009
+ target_name=tmp_target_name,
1987
2010
  )
1988
2011
  if enriched_Xy is None:
1989
2012
  return None
@@ -2145,6 +2168,7 @@ if response.status_code == 200:
2145
2168
  progress_bar: Optional[ProgressBar] = None,
2146
2169
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2147
2170
  add_fit_system_record_id: bool = False,
2171
+ target_name: Optional[str] = None,
2148
2172
  ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2149
2173
  if self._search_task is None:
2150
2174
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2329,8 +2353,16 @@ if response.status_code == 200:
2329
2353
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2330
2354
  ]
2331
2355
 
2332
- if add_fit_system_record_id:
2333
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2356
+ if add_fit_system_record_id and target_name is not None:
2357
+ reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
2358
+ df = self.__add_fit_system_record_id(
2359
+ df,
2360
+ search_keys,
2361
+ SYSTEM_RECORD_ID,
2362
+ reversed_columns_renaming.get(target_name, target_name),
2363
+ columns_renaming,
2364
+ silent=True,
2365
+ )
2334
2366
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2335
2367
  features_not_to_pass.append(SORT_ID)
2336
2368
 
@@ -2775,7 +2807,9 @@ if response.status_code == 200:
2775
2807
  self.__log_warning(full_duplicates_warning)
2776
2808
 
2777
2809
  # Explode multiple search keys
2778
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2810
+ df = self.__add_fit_system_record_id(
2811
+ df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
2812
+ )
2779
2813
 
2780
2814
  # TODO check that this is correct for enrichment
2781
2815
  self.df_with_original_index = df.copy()
@@ -2857,7 +2891,9 @@ if response.status_code == 200:
2857
2891
  if eval_set is not None and len(eval_set) > 0:
2858
2892
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2859
2893
 
2860
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2894
+ df = self.__add_fit_system_record_id(
2895
+ df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
2896
+ )
2861
2897
 
2862
2898
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2863
2899
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -3544,56 +3580,82 @@ if response.status_code == 200:
3544
3580
  def __add_fit_system_record_id(
3545
3581
  self,
3546
3582
  df: pd.DataFrame,
3547
- # meaning_types: Dict[str, FileColumnMeaningType],
3548
3583
  search_keys: Dict[str, SearchKey],
3549
3584
  id_name: str,
3585
+ target_name: str,
3586
+ columns_renaming: Dict[str, str],
3587
+ silent: bool = False,
3550
3588
  ) -> pd.DataFrame:
3551
- # save original order or rows
3552
3589
  original_index_name = df.index.name
3553
3590
  index_name = df.index.name or DEFAULT_INDEX
3554
3591
  original_order_name = "original_order"
3592
+ # Save original index
3555
3593
  df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
3594
+ # Save original order
3556
3595
  df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
3557
3596
 
3558
- # order by date and idempotent order by other keys
3559
- if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3560
- sort_exclude_columns = [
3561
- original_order_name,
3562
- ORIGINAL_INDEX,
3563
- EVAL_SET_INDEX,
3564
- TARGET,
3565
- "__target",
3566
- ENTITY_SYSTEM_RECORD_ID,
3567
- ]
3568
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3569
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3570
- sort_exclude_columns.append(self._get_date_column(search_keys))
3571
- else:
3572
- date_column = self._get_date_column(search_keys)
3573
- sort_columns = [date_column] if date_column is not None else []
3597
+ # order by date and idempotent order by other keys and features
3574
3598
 
3575
- sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3576
- sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3599
+ sort_exclude_columns = [
3600
+ original_order_name,
3601
+ ORIGINAL_INDEX,
3602
+ EVAL_SET_INDEX,
3603
+ TARGET,
3604
+ "__target",
3605
+ ENTITY_SYSTEM_RECORD_ID,
3606
+ ]
3607
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3608
+ date_column = DateTimeSearchKeyConverter.DATETIME_COL
3609
+ sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
3610
+ else:
3611
+ date_column = FeaturesEnricher._get_date_column(search_keys)
3612
+ sort_exclude_columns.append(date_column)
3613
+ columns_to_sort = [date_column] if date_column is not None else []
3614
+
3615
+ do_sorting = True
3616
+ if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
3617
+ # Check duplicates by date and id_columns
3618
+ reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
3619
+ renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
3620
+ duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
3621
+ if date_column is not None:
3622
+ duplicate_check_columns.append(date_column)
3577
3623
 
3578
- other_columns = sorted(
3579
- [
3580
- c
3581
- for c in df.columns
3582
- if c not in sort_columns
3583
- and c not in sorted_other_keys
3584
- and c not in sort_exclude_columns
3585
- and df[c].nunique() > 1
3586
- ]
3624
+ duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3625
+ if duplicates.any():
3626
+ if not silent:
3627
+ self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3628
+ else:
3629
+ self.logger.warning(
3630
+ f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
3631
+ " Will not sort dataset"
3632
+ )
3633
+ do_sorting = False
3634
+ else:
3635
+ columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3636
+ columns_to_hash = sort_columns(
3637
+ df[columns_to_hash],
3638
+ target_name,
3639
+ search_keys,
3640
+ self.model_task_type,
3641
+ sort_exclude_columns,
3642
+ logger=self.logger,
3643
+ )
3644
+ else:
3645
+ columns_to_hash = sort_columns(
3646
+ df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
3587
3647
  )
3588
-
3589
- all_other_columns = sorted_other_keys + other_columns
3590
-
3648
+ if do_sorting:
3591
3649
  search_keys_hash = "search_keys_hash"
3592
- if len(all_other_columns) > 0:
3593
- sort_columns.append(search_keys_hash)
3594
- df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3595
-
3596
- df = df.sort_values(by=sort_columns)
3650
+ if len(columns_to_hash) > 0:
3651
+ factorized_df = df.copy()
3652
+ for col in columns_to_hash:
3653
+ if col not in search_keys and not is_numeric_dtype(factorized_df[col]):
3654
+ factorized_df[col] = factorized_df[col].factorize(sort=True)[0]
3655
+ df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
3656
+ columns_to_sort.append(search_keys_hash)
3657
+
3658
+ df = df.sort_values(by=columns_to_sort)
3597
3659
 
3598
3660
  if search_keys_hash in df.columns:
3599
3661
  df.drop(columns=search_keys_hash, inplace=True)
@@ -30,8 +30,8 @@ except ImportError:
30
30
  from sklearn.metrics._regression import (
31
31
  _check_reg_targets,
32
32
  check_consistent_length,
33
- mean_squared_error,
34
33
  )
34
+ from sklearn.metrics import mean_squared_error
35
35
  from sklearn.model_selection import BaseCrossValidator
36
36
 
37
37
  from upgini.errors import ValidationError
@@ -289,9 +289,6 @@ class EstimatorWrapper:
289
289
  else:
290
290
  x, y = self._remove_empty_target_rows(x, y)
291
291
 
292
- # Make order of columns idempotent
293
- x = x[sorted(x.columns)]
294
-
295
292
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
296
293
  return x, y, groups
297
294
 
@@ -569,7 +566,7 @@ class CatBoostWrapper(EstimatorWrapper):
569
566
  if all([isinstance(c, int) for c in estimator_cat_features]):
570
567
  cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
571
568
  cat_features_idx.update(estimator_cat_features)
572
- self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
569
+ self.cat_features = [x.columns[idx] for idx in cat_features_idx]
573
570
  elif all([isinstance(c, str) for c in estimator_cat_features]):
574
571
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
575
572
  else:
@@ -940,13 +937,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
940
937
  if (y_true < 0).any():
941
938
  raise ValidationError(bundle.get("metrics_msle_negative_target"))
942
939
 
943
- return mean_squared_error(
940
+ mse = mean_squared_error(
944
941
  log1p(y_true),
945
942
  log1p(y_pred.clip(0)),
946
943
  sample_weight=sample_weight,
947
944
  multioutput=multioutput,
948
- squared=squared,
949
945
  )
946
+ return mse if squared else np.sqrt(mse)
950
947
 
951
948
 
952
949
  def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
35
35
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
36
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
37
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
+ date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
38
39
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
39
40
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
40
41
  # Errors
@@ -166,6 +166,8 @@ class DateTimeSearchKeyConverter:
166
166
 
167
167
  # Drop intermediate columns if not needed
168
168
  df.drop(columns=["second", "minute", "hour"], inplace=True)
169
+ else:
170
+ keep_time = False
169
171
 
170
172
  for generated_feature in self.generated_features[:]:
171
173
  if df[generated_feature].dropna().nunique() <= 1:
@@ -0,0 +1,177 @@
1
+ import warnings
2
+ from collections import namedtuple
3
+
4
+ import numpy as np
5
+ import numpy.ma as ma
6
+ import scipy
7
+ from joblib import Parallel, delayed
8
+ from numpy import ndarray
9
+ from psutil import cpu_count
10
+
11
+ np.seterr(divide="ignore")
12
+
13
+
14
+ warnings.simplefilter(action="ignore", category=RuntimeWarning)
15
+
16
+
17
+ def _find_repeats(arr):
18
+ # This function assumes it may clobber its input.
19
+ if len(arr) == 0:
20
+ return np.array(0, np.float64), np.array(0, np.intp)
21
+
22
+ # XXX This cast was previously needed for the Fortran implementation,
23
+ # should we ditch it?
24
+ arr = np.asarray(arr, np.float64).ravel()
25
+ arr.sort()
26
+
27
+ # Taken from NumPy 1.9's np.unique.
28
+ change = np.concatenate(([True], arr[1:] != arr[:-1]))
29
+ unique = arr[change]
30
+ change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
31
+ freq = np.diff(change_idx)
32
+ atleast2 = freq > 1
33
+ return unique[atleast2], freq[atleast2]
34
+
35
+
36
+ def find_repeats(arr):
37
+ # Make sure we get a copy. ma.compressed promises a "new array", but can
38
+ # actually return a reference.
39
+ compr = np.asarray(ma.compressed(arr), dtype=np.float64)
40
+ try:
41
+ need_copy = np.may_share_memory(compr, arr)
42
+ except AttributeError:
43
+ # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
44
+ # while in numpy 1.8.2 and above it just (correctly) returns False.
45
+ need_copy = False
46
+ if need_copy:
47
+ compr = compr.copy()
48
+ return _find_repeats(compr)
49
+
50
+
51
+ def rankdata(data, axis=None, use_missing=False):
52
+ def _rank1d(data, use_missing=False):
53
+ n = data.count()
54
+ rk = np.empty(data.size, dtype=float)
55
+ idx = data.argsort()
56
+ rk[idx[:n]] = np.arange(1, n + 1)
57
+
58
+ if use_missing:
59
+ rk[idx[n:]] = (n + 1) / 2.0
60
+ else:
61
+ rk[idx[n:]] = 0
62
+
63
+ repeats = find_repeats(data.copy())
64
+ for r in repeats[0]:
65
+ condition = (data == r).filled(False)
66
+ rk[condition] = rk[condition].mean()
67
+ return rk
68
+
69
+ data = ma.array(data, copy=False)
70
+ if axis is None:
71
+ if data.ndim > 1:
72
+ return _rank1d(data.ravel(), use_missing).reshape(data.shape)
73
+ else:
74
+ return _rank1d(data, use_missing)
75
+ else:
76
+ return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
77
+
78
+
79
+ def _chk_asarray(a, axis):
80
+ # Always returns a masked array, raveled for axis=None
81
+ a = ma.asanyarray(a)
82
+ if axis is None:
83
+ a = ma.ravel(a)
84
+ outaxis = 0
85
+ else:
86
+ outaxis = axis
87
+ return a, outaxis
88
+
89
+
90
+ SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
91
+
92
+
93
+ # Taken from scipy.mstats with following tweaks:
94
+ # 1. parallel pairwise computation
95
+ # 2. custom masking
96
+ def spearmanr(
97
+ x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
98
+ ):
99
+ if not use_ties:
100
+ raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
101
+
102
+ # Always returns a masked array, raveled if axis=None
103
+ x, axisout = _chk_asarray(x, axis)
104
+ if y is not None:
105
+ # Deal only with 2-D `x` case.
106
+ y, _ = _chk_asarray(y, axis)
107
+ if axisout == 0:
108
+ x = ma.column_stack((x, y))
109
+ else:
110
+ x = ma.row_stack((x, y))
111
+
112
+ if axisout == 1:
113
+ # To simplify the code that follow (always use `n_obs, n_vars` shape)
114
+ x = x.T
115
+
116
+ if nan_policy == "omit":
117
+ x = mask_fn(x)
118
+
119
+ def _spearmanr_2cols(x):
120
+ # Mask the same observations for all variables, and then drop those
121
+ # observations (can't leave them masked, rankdata is weird).
122
+ x = ma.mask_rowcols(x, axis=0)
123
+ x = x[~x.mask.any(axis=1), :]
124
+
125
+ # If either column is entirely NaN or Inf
126
+ if not np.any(x.data):
127
+ return SpearmanrResult(np.nan, np.nan)
128
+
129
+ m = ma.getmask(x)
130
+ n_obs = x.shape[0]
131
+ dof = n_obs - 2 - int(m.sum(axis=0)[0])
132
+ if dof < 0:
133
+ return SpearmanrResult(np.nan, np.nan)
134
+
135
+ # Gets the ranks and rank differences
136
+ x_ranked = rankdata(x, axis=0)
137
+ rs = ma.corrcoef(x_ranked, rowvar=False).data
138
+
139
+ # rs can have elements equal to 1, so avoid zero division warnings
140
+ with np.errstate(divide="ignore"):
141
+ # clip the small negative values possibly caused by rounding
142
+ # errors before taking the square root
143
+ t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
+
145
+ t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
146
+
147
+ # For backwards compatibility, return scalars when comparing 2 columns
148
+ if rs.shape == (2, 2):
149
+ return SpearmanrResult(rs[1, 0], prob[1, 0])
150
+ else:
151
+ return SpearmanrResult(rs, prob)
152
+
153
+ # Need to do this per pair of variables, otherwise the dropped observations
154
+ # in a third column mess up the result for a pair.
155
+ n_vars = x.shape[1]
156
+ if n_vars == 2:
157
+ return _spearmanr_2cols(x)
158
+ else:
159
+ max_cpu_cores = cpu_count(logical=False)
160
+ with np.errstate(divide="ignore"):
161
+ results = Parallel(n_jobs=max_cpu_cores)(
162
+ delayed(_spearmanr_2cols)(x[:, [var1, var2]])
163
+ for var1 in range(n_vars - 1)
164
+ for var2 in range(var1 + 1, n_vars)
165
+ )
166
+
167
+ rs = np.ones((n_vars, n_vars), dtype=float)
168
+ prob = np.zeros((n_vars, n_vars), dtype=float)
169
+ for var1 in range(n_vars - 1):
170
+ for var2 in range(var1 + 1, n_vars):
171
+ result = results.pop(0)
172
+ rs[var1, var2] = result.correlation
173
+ rs[var2, var1] = result.correlation
174
+ prob[var1, var2] = result.pvalue
175
+ prob[var2, var1] = result.pvalue
176
+
177
+ return SpearmanrResult(rs, prob)
@@ -0,0 +1,172 @@
1
+ import hashlib
2
+ import logging
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from joblib import Parallel, delayed
8
+ from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
9
+ from psutil import cpu_count
10
+ from scipy.stats import skew, spearmanr
11
+
12
+ from upgini.metadata import ModelTaskType, SearchKey
13
+ from upgini.utils import mstats
14
+
15
+
16
def sort_columns(
    df: pd.DataFrame,
    target_column: Union[str, pd.Series],
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: Optional[List[str]] = None,
    sort_all_columns: bool = False,
    logger: Optional[logging.Logger] = None,
) -> List[str]:
    """Return the column names of ``df`` in a deterministic order.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey`` value. The remaining columns follow, ordered descending by
    the ``(correlation, hash)`` tuple that ``get_sort_columns_dict`` computes
    for them.

    Args:
        df: Source frame. It is copied, so the caller's frame is not modified.
        target_column: Either the target series itself or the name of the
            target column inside ``df``.
        search_keys: Mapping of column name to its ``SearchKey``.
        model_task_type: Forwarded to ``prepare_target`` to encode the target.
        exclude_columns: Column names that must not appear in the result.
        sort_all_columns: When true, columns with a single unique value are
            kept as well (and forwarded to ``get_sort_columns_dict``).
        logger: Logger used for diagnostics. When omitted, the module logger
            is used with its level raised to ``FATAL``.

    Returns:
        Search-key columns followed by the other ranked columns. Columns for
        which ``get_sort_columns_dict`` returned no entry are left out.
    """
    if exclude_columns is None:
        exclude_columns = []
    if logger is None:
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.FATAL)
    df = df.copy()  # avoid side effects

    # Several columns mapped to the same SearchKey make the key ordering
    # ambiguous; report it through the logger the caller supplied rather than
    # through the root logger.
    search_key_values = list(search_keys.values())
    has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
    if has_duplicate_search_keys:
        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")

    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]

    # Alphabetical pre-sort gives a stable starting order before ranking.
    other_columns = sorted(
        [
            c
            for c in df.columns
            if c not in sorted_keys and c not in exclude_columns and (df[c].nunique() > 1 or sort_all_columns)
        ]
    )
    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
    target = prepare_target(target, model_task_type)
    sort_dict = get_sort_columns_dict(
        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
    )
    # Only columns that actually received a ranking entry can be ordered.
    other_columns = [c for c in other_columns if c in sort_dict]
    columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
    return columns_for_sort
56
+
57
+
58
def get_sort_columns_dict(
    df: pd.DataFrame,
    target: pd.Series,
    sorted_keys: List[str],
    omit_nan: bool,
    n_jobs: Optional[int] = None,
    sort_all_columns: bool = False,
) -> Dict[str, Any]:
    """Map each rankable column to a ``(correlation, hash)`` tuple.

    Columns listed in ``sorted_keys`` are never ranked. Non-numeric columns
    are ranked only when the numeric part of ``df`` contains duplicate rows or
    ``sort_all_columns`` is set; in that case they are replaced by their
    sorted factorization codes first.

    Returns:
        A dict keyed by column name, or an empty dict when nothing is left to
        rank.
    """
    non_numeric = df.select_dtypes(exclude=[np.number]).columns
    string_features = [name for name in non_numeric if name not in sorted_keys]
    not_rankable = sorted_keys + string_features
    columns_for_sort = [name for name in df.columns if name not in not_rankable]

    if string_features:
        numeric_part = df.drop(columns=string_features)
        if len(df) > len(numeric_part.drop_duplicates()) or sort_all_columns:
            # Encode the string features as integer codes so they can take
            # part in the correlation computation.
            for name in string_features:
                codes = df[name].factorize(sort=True)[0]
                df.loc[:, name] = pd.Series(codes, index=df.index, dtype="int")
            columns_for_sort += string_features

    if not columns_for_sort:
        return {}

    df = df[columns_for_sort]
    # The per-column hash is the second tuple element of each entry.
    hashes = [hash_series(df[name]) for name in columns_for_sort]
    df = np.asarray(df, dtype=np.float32)
    correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)

    return dict(zip(columns_for_sort, zip(correlations, hashes)))
85
+
86
+
87
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
    """Return, per column, the largest of the target correlations.

    The correlations come from ``get_target_correlations`` (one row per
    correlation method), truncated to 7 decimal places.
    """
    per_method = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    # Collapse the method axis: keep the strongest signal for every column.
    return per_method.max(axis=0)
91
+
92
+
93
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
):
    """Compute absolute Spearman and Pearson correlations with the target.

    Returns:
        A ``(2, n_columns)`` array: row 0 holds the Spearman values from
        ``calculate_spearman_corr_with_target``, row 1 the absolute Pearson
        values from ``np.corrcoef``. NaNs are replaced via ``np.nan_to_num``
        and every value is truncated to ``precision`` decimal places.
    """
    features = np.asarray(df, dtype=np.float32)

    spearman_row = np.nan_to_num(
        calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs), copy=False
    )
    # The target is the last variable passed to corrcoef, so the last row
    # without its final element holds the feature/target coefficients.
    pearson_matrix = np.corrcoef(features.T, target.T, rowvar=True)
    pearson_row = np.nan_to_num(np.abs(pearson_matrix[-1, :-1]))

    target_correlations = np.zeros((2, features.shape[1]))
    target_correlations[0, :] = spearman_row
    target_correlations[1, :] = pearson_row

    # Truncate (not round) to the requested number of decimal places.
    scale = 10**precision
    return np.trunc(target_correlations * scale) / scale
106
+
107
+
108
def calculate_spearman_corr_with_target(
    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
) -> np.ndarray:
    """Absolute Spearman correlation of every column of ``X`` with ``y``.

    Columns whose values all equal their first element are not computed and
    stay NaN in the result. With ``omit_nan`` each remaining column is
    evaluated separately through ``mstats.spearmanr`` with
    ``nan_policy="omit"``; otherwise columns containing NaN are skipped as
    well and the rest go through ``calculate_spearman`` in one call.

    Returns:
        A 1-D array with one entry per column, or an empty array when ``X``
        has no elements.
    """
    if isinstance(X, pd.DataFrame):
        X = np.asarray(X, dtype=np.float32)

    if X.size == 0:
        return np.empty(0)

    all_correlations = np.full(X.shape[1], np.nan)
    varies = [col.size > 0 and not (col == col[0]).all() for col in X.T]
    cols2calc = np.where(varies)[0]

    if omit_nan:
        workers = n_jobs or cpu_count(logical=False)
        results = Parallel(n_jobs=workers)(
            delayed(mstats.spearmanr)(X[:, idx], y, nan_policy="omit", axis=0) for idx in cols2calc
        )
        target_correlations = np.array([abs(item.correlation) for item in results])
    else:
        # nan_policy="raise" is used below, so drop every column with a NaN.
        nan_free = ~np.isnan(X[:, cols2calc]).any(axis=0)
        cols2calc = cols2calc[nan_free]
        raw = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
        if isinstance(raw, float):
            # A single coefficient comes back as a scalar.
            target_correlations = np.abs([raw])
        else:
            # y is the last variable: take its row without the final element.
            target_correlations = np.abs(raw)[-1, :-1]

    all_correlations[cols2calc] = target_correlations

    return all_correlations
143
+
144
+
145
def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
    """Spearman correlation of the columns of ``X`` (plus ``y`` when given).

    Returns ``1.0`` when fewer than two variables are available in total;
    otherwise the ``correlation`` attribute of ``scipy.stats.spearmanr``.
    """
    variables_total = X.shape[1] + (0 if y is None else 1)
    if variables_total >= 2:
        return spearmanr(X, y, nan_policy=nan_policy).correlation
    # Fewer than two variables: the function reports 1.0 without calling scipy.
    return 1.0
154
+
155
+
156
def hash_series(series: pd.Series) -> int:
    """Return the SHA-256 digest of a series' row hashes as an integer.

    The row hashes come from ``pd.util.hash_pandas_object`` with the index
    included.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    # Big-endian interpretation of the digest equals int(hexdigest, 16).
    return int.from_bytes(digest, byteorder="big")
158
+
159
+
160
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Encode the target so it can be correlated with the features.

    For any task other than regression, or when the target is neither numeric
    nor datetime, the values are cast to strings and replaced by their
    category codes. Otherwise a non-negative target whose absolute skewness
    (rounded to 2 decimals) is at least 0.9 is transformed with ``log1p``.

    Returns:
        A new series carrying the original target's name.
    """
    original_name = target.name

    encode_as_labels = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )
    if encode_as_labels:
        target = target.astype(str).astype("category").cat.codes
    else:
        # Regression on a numeric/datetime target: compress heavy skew.
        skewness = round(abs(skew(target)), 2)
        if target.min() >= 0 and skewness >= 0.9:
            target = np.log1p(target)

    return pd.Series(target, name=original_name)
@@ -1,4 +1,3 @@
1
- import itertools
2
1
  import logging
3
2
  from typing import Callable, List, Optional, Union
4
3
 
@@ -208,7 +207,7 @@ def balance_undersample_forced(
208
207
  id_columns: List[str],
209
208
  date_column: str,
210
209
  task_type: ModelTaskType,
211
- cv_type: CVType | None,
210
+ cv_type: Optional[CVType],
212
211
  random_state: int,
213
212
  sample_size: int = 7000,
214
213
  logger: Optional[logging.Logger] = None,
@@ -372,7 +371,8 @@ def balance_undersample_time_series(
372
371
  if len(id_counts) < min_different_ids:
373
372
  if logger is not None:
374
373
  logger.info(
375
- f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
374
+ f"Different ids count {len(id_counts)} for sample size {sample_size}"
375
+ f" is less than min different ids {min_different_ids}, sampling time window"
376
376
  )
377
377
  date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
378
378
  ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
@@ -8,23 +8,17 @@ def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_co
8
8
  def closest_unit(diff):
9
9
  return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
10
10
 
11
- # Calculate differences for each ID group
12
11
  all_diffs = []
13
12
  groups = df.groupby(id_columns) if id_columns else [(None, df)]
14
13
  for _, group in groups:
15
- # Get sorted dates for this group
16
14
  group_dates = group[date_column].sort_values().unique()
17
15
  if len(group_dates) > 1:
18
- # Calculate time differences between consecutive dates
19
16
  diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
20
- # Convert to nanoseconds
21
17
  diff_ns = diff_series.dt.total_seconds()
22
18
  all_diffs.extend(diff_ns)
23
19
 
24
- # Convert to series for easier processing
25
20
  all_diffs = pd.Series(all_diffs)
26
21
 
27
- # Get most common time unit across all groups
28
22
  most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
29
23
 
30
24
  return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
@@ -1 +0,0 @@
1
- __version__ = "1.2.60a3792.dev2"
File without changes
File without changes
File without changes