upgini 1.2.58a1__tar.gz → 1.2.59a3818.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (69)
  1. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/PKG-INFO +2 -2
  2. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/pyproject.toml +1 -1
  3. upgini-1.2.59a3818.dev1/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/date.py +8 -0
  5. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/feature.py +1 -10
  6. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/vector.py +1 -1
  7. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/features_enricher.py +40 -54
  8. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/strings.properties +0 -1
  9. upgini-1.2.58a1/src/upgini/__about__.py +0 -1
  10. upgini-1.2.58a1/src/upgini/utils/mstats.py +0 -177
  11. upgini-1.2.58a1/src/upgini/utils/sort.py +0 -160
  12. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/.gitignore +0 -0
  13. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/LICENSE +0 -0
  14. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/README.md +0 -0
  15. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/__init__.py +0 -0
  16. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/dataset.py +0 -0
  28. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/errors.py +0 -0
  29. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/http.py +0 -0
  30. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/metadata.py +0 -0
  34. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/metrics.py +0 -0
  35. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  47. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  54. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/feature_info.py +0 -0
  59. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/ip_utils.py +0 -0
  62. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/target_utils.py +0 -0
  67. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.58a1
3
+ Version: 1.2.59a3818.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
34
34
  Requires-Dist: pyjwt>=2.8.0
35
35
  Requires-Dist: python-bidi==0.4.2
36
36
  Requires-Dist: python-dateutil>=2.8.0
37
- Requires-Dist: python-json-logger>=2.0.2
37
+ Requires-Dist: python-json-logger>=3.3.0
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
40
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -43,7 +43,7 @@ dependencies = [
43
43
  "pydantic>1.0.0,<3.0.0",
44
44
  "pyjwt>=2.8.0",
45
45
  "python-dateutil>=2.8.0",
46
- "python-json-logger>=2.0.2",
46
+ "python-json-logger>=3.3.0",
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
49
  "python-bidi==0.4.2",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.59a3818.dev1"
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
64
  return res
65
65
 
66
66
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
67
+ if left.isna().all() or right.isna().all():
68
+ return pd.Series([None] * len(left))
69
+
67
70
  left = self._convert_to_date(left, self.left_unit)
68
71
  right = self._convert_to_date(right, self.right_unit)
69
72
  diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
142
145
  return cls(aggregation=aggregation)
143
146
 
144
147
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
148
+ if left.isna().all() or right.isna().all():
149
+ return pd.Series([None] * len(left), dtype=np.float64)
150
+
145
151
  left = self._convert_to_date(left, self.left_unit)
146
152
  right_mask = right.apply(lambda x: len(x) > 0)
147
153
  mask = left.notna() & right.notna() & right_mask
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
230
236
  pass
231
237
 
232
238
  def _perc(self, f, bounds):
239
+ if f is None or np.isnan(f):
240
+ return np.nan
233
241
  hit = np.where(f >= np.array(bounds))[0]
234
242
  if hit.size > 0:
235
243
  return np.max(hit) + 1
@@ -26,18 +26,9 @@ class Column:
26
26
  return dict()
27
27
 
28
28
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
29
- self.name = self._unhash(mapping.get(self.name) or self.name)
29
+ self.name = mapping.get(self.name) or self.name
30
30
  return self
31
31
 
32
- def _unhash(self, feature_name: str) -> str:
33
- last_component_idx = feature_name.rfind("_")
34
- if not feature_name.startswith("f_"):
35
- return feature_name # etalon feature
36
- elif last_component_idx == 1:
37
- return feature_name[2:] # fully hashed name, cannot unhash
38
- else:
39
- return feature_name[2:last_component_idx]
40
-
41
32
  def delete_data(self):
42
33
  self.data = None
43
34
 
@@ -55,7 +55,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
55
55
  ts.set_index(date.name, inplace=True)
56
56
  ts = ts[ts.index.notna()].sort_index()
57
57
  ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
58
+ ts.groupby([c.name for c in data[1:-1]], group_keys=True)
59
59
  .apply(self._shift)[data[-1].name]
60
60
  .to_frame()
61
61
  .reset_index()
@@ -112,7 +112,6 @@ try:
112
112
  except Exception:
113
113
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
114
114
 
115
- from upgini.utils.sort import sort_columns
116
115
  from upgini.utils.target_utils import (
117
116
  balance_undersample_forced,
118
117
  calculate_psi,
@@ -1258,7 +1257,7 @@ class FeaturesEnricher(TransformerMixin):
1258
1257
  for feature, shap in new_shaps.items()
1259
1258
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1260
1259
  }
1261
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1260
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1262
1261
 
1263
1262
  if self.features_info_display_handle is not None:
1264
1263
  try:
@@ -1735,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
1735
1734
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1736
1735
  df = df.sample(n=sample_rows, random_state=self.random_state)
1737
1736
 
1738
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET)
1737
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1739
1738
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1740
1739
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1741
1740
 
@@ -1916,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
1916
1915
  progress_bar=progress_bar,
1917
1916
  progress_callback=progress_callback,
1918
1917
  add_fit_system_record_id=True,
1919
- target_name=tmp_target_name,
1920
1918
  )
1921
1919
  if enriched_df is None:
1922
1920
  return None
@@ -1966,7 +1964,6 @@ class FeaturesEnricher(TransformerMixin):
1966
1964
  progress_bar=progress_bar,
1967
1965
  progress_callback=progress_callback,
1968
1966
  add_fit_system_record_id=True,
1969
- target_name=tmp_target_name,
1970
1967
  )
1971
1968
  if enriched_Xy is None:
1972
1969
  return None
@@ -2128,7 +2125,6 @@ if response.status_code == 200:
2128
2125
  progress_bar: Optional[ProgressBar] = None,
2129
2126
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2130
2127
  add_fit_system_record_id: bool = False,
2131
- target_name: Optional[str] = None,
2132
2128
  ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2133
2129
  if self._search_task is None:
2134
2130
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2313,11 +2309,8 @@ if response.status_code == 200:
2313
2309
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2314
2310
  ]
2315
2311
 
2316
- if add_fit_system_record_id and target_name is not None:
2317
- reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
2318
- df = self.__add_fit_system_record_id(
2319
- df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
2320
- )
2312
+ if add_fit_system_record_id:
2313
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2321
2314
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2322
2315
  features_not_to_pass.append(SORT_ID)
2323
2316
 
@@ -2761,7 +2754,7 @@ if response.status_code == 200:
2761
2754
  self.__log_warning(full_duplicates_warning)
2762
2755
 
2763
2756
  # Explode multiple search keys
2764
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET)
2757
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2765
2758
 
2766
2759
  # TODO check that this is correct for enrichment
2767
2760
  self.df_with_original_index = df.copy()
@@ -2843,7 +2836,7 @@ if response.status_code == 200:
2843
2836
  if eval_set is not None and len(eval_set) > 0:
2844
2837
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2845
2838
 
2846
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET)
2839
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2847
2840
 
2848
2841
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2849
2842
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -3535,60 +3528,53 @@ if response.status_code == 200:
3535
3528
  # meaning_types: Dict[str, FileColumnMeaningType],
3536
3529
  search_keys: Dict[str, SearchKey],
3537
3530
  id_name: str,
3538
- target_name: str,
3539
3531
  ) -> pd.DataFrame:
3532
+ # save original order or rows
3540
3533
  original_index_name = df.index.name
3541
3534
  index_name = df.index.name or DEFAULT_INDEX
3542
3535
  original_order_name = "original_order"
3543
- # Save original index
3544
3536
  df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
3545
- # Save original order
3546
3537
  df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
3547
3538
 
3548
- # order by date and idempotent order by other keys and features
3539
+ # order by date and idempotent order by other keys
3540
+ if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3541
+ sort_exclude_columns = [
3542
+ original_order_name,
3543
+ ORIGINAL_INDEX,
3544
+ EVAL_SET_INDEX,
3545
+ TARGET,
3546
+ "__target",
3547
+ ENTITY_SYSTEM_RECORD_ID,
3548
+ ]
3549
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3550
+ date_column = DateTimeSearchKeyConverter.DATETIME_COL
3551
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3552
+ else:
3553
+ date_column = self._get_date_column(search_keys)
3554
+ sort_columns = [date_column] if date_column is not None else []
3549
3555
 
3550
- sort_exclude_columns = [
3551
- original_order_name,
3552
- ORIGINAL_INDEX,
3553
- EVAL_SET_INDEX,
3554
- TARGET,
3555
- "__target",
3556
- ENTITY_SYSTEM_RECORD_ID,
3557
- ]
3558
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3559
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3560
- sort_exclude_columns.append(self._get_date_column(search_keys))
3561
- else:
3562
- date_column = self._get_date_column(search_keys)
3563
- sort_exclude_columns.append(date_column)
3564
- columns_to_sort = [date_column] if date_column is not None else []
3556
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3557
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3565
3558
 
3566
- do_sorting = True
3567
- if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
3568
- # Check duplicates by date and id_columns
3569
- duplicate_check_columns = [c for c in self.id_columns if c in df.columns]
3570
- if date_column is not None:
3571
- duplicate_check_columns.append(date_column)
3559
+ other_columns = sorted(
3560
+ [
3561
+ c
3562
+ for c in df.columns
3563
+ if c not in sort_columns
3564
+ and c not in sorted_other_keys
3565
+ and c not in sort_exclude_columns
3566
+ and df[c].nunique() > 1
3567
+ ]
3568
+ )
3572
3569
 
3573
- duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3574
- if duplicates.any():
3575
- self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3576
- do_sorting = False
3577
- else:
3578
- columns_to_hash = list(search_keys.keys()) + self.id_columns
3579
- columns_to_hash = sort_columns(
3580
- df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
3581
- )
3582
- else:
3583
- columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
3570
+ all_other_columns = sorted_other_keys + other_columns
3584
3571
 
3585
- if do_sorting:
3586
3572
  search_keys_hash = "search_keys_hash"
3587
- if len(columns_to_hash) > 0:
3588
- df[search_keys_hash] = pd.util.hash_pandas_object(df[columns_to_hash], index=False)
3589
- columns_to_sort.append(search_keys_hash)
3573
+ if len(all_other_columns) > 0:
3574
+ sort_columns.append(search_keys_hash)
3575
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3590
3576
 
3591
- df = df.sort_values(by=columns_to_sort)
3577
+ df = df.sort_values(by=sort_columns)
3592
3578
 
3593
3579
  if search_keys_hash in df.columns:
3594
3580
  df.drop(columns=search_keys_hash, inplace=True)
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
35
35
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
36
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
37
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
- date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
39
38
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
40
39
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
40
  # Errors
@@ -1 +0,0 @@
1
- __version__ = "1.2.58a1"
@@ -1,177 +0,0 @@
1
- import warnings
2
- from collections import namedtuple
3
-
4
- import numpy as np
5
- import numpy.ma as ma
6
- import scipy
7
- from joblib import Parallel, delayed
8
- from numpy import ndarray
9
- from psutil import cpu_count
10
-
11
- np.seterr(divide="ignore")
12
-
13
-
14
- warnings.simplefilter(action="ignore", category=RuntimeWarning)
15
-
16
-
17
- def _find_repeats(arr):
18
- # This function assumes it may clobber its input.
19
- if len(arr) == 0:
20
- return np.array(0, np.float64), np.array(0, np.intp)
21
-
22
- # XXX This cast was previously needed for the Fortran implementation,
23
- # should we ditch it?
24
- arr = np.asarray(arr, np.float64).ravel()
25
- arr.sort()
26
-
27
- # Taken from NumPy 1.9's np.unique.
28
- change = np.concatenate(([True], arr[1:] != arr[:-1]))
29
- unique = arr[change]
30
- change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
31
- freq = np.diff(change_idx)
32
- atleast2 = freq > 1
33
- return unique[atleast2], freq[atleast2]
34
-
35
-
36
- def find_repeats(arr):
37
- # Make sure we get a copy. ma.compressed promises a "new array", but can
38
- # actually return a reference.
39
- compr = np.asarray(ma.compressed(arr), dtype=np.float64)
40
- try:
41
- need_copy = np.may_share_memory(compr, arr)
42
- except AttributeError:
43
- # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
44
- # while in numpy 1.8.2 and above it just (correctly) returns False.
45
- need_copy = False
46
- if need_copy:
47
- compr = compr.copy()
48
- return _find_repeats(compr)
49
-
50
-
51
- def rankdata(data, axis=None, use_missing=False):
52
- def _rank1d(data, use_missing=False):
53
- n = data.count()
54
- rk = np.empty(data.size, dtype=float)
55
- idx = data.argsort()
56
- rk[idx[:n]] = np.arange(1, n + 1)
57
-
58
- if use_missing:
59
- rk[idx[n:]] = (n + 1) / 2.0
60
- else:
61
- rk[idx[n:]] = 0
62
-
63
- repeats = find_repeats(data.copy())
64
- for r in repeats[0]:
65
- condition = (data == r).filled(False)
66
- rk[condition] = rk[condition].mean()
67
- return rk
68
-
69
- data = ma.array(data, copy=False)
70
- if axis is None:
71
- if data.ndim > 1:
72
- return _rank1d(data.ravel(), use_missing).reshape(data.shape)
73
- else:
74
- return _rank1d(data, use_missing)
75
- else:
76
- return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
77
-
78
-
79
- def _chk_asarray(a, axis):
80
- # Always returns a masked array, raveled for axis=None
81
- a = ma.asanyarray(a)
82
- if axis is None:
83
- a = ma.ravel(a)
84
- outaxis = 0
85
- else:
86
- outaxis = axis
87
- return a, outaxis
88
-
89
-
90
- SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
91
-
92
-
93
- # Taken from scipy.mstats with following tweaks:
94
- # 1. parallel pairwise computation
95
- # 2. custom masking
96
- def spearmanr(
97
- x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
98
- ):
99
- if not use_ties:
100
- raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
101
-
102
- # Always returns a masked array, raveled if axis=None
103
- x, axisout = _chk_asarray(x, axis)
104
- if y is not None:
105
- # Deal only with 2-D `x` case.
106
- y, _ = _chk_asarray(y, axis)
107
- if axisout == 0:
108
- x = ma.column_stack((x, y))
109
- else:
110
- x = ma.row_stack((x, y))
111
-
112
- if axisout == 1:
113
- # To simplify the code that follow (always use `n_obs, n_vars` shape)
114
- x = x.T
115
-
116
- if nan_policy == "omit":
117
- x = mask_fn(x)
118
-
119
- def _spearmanr_2cols(x):
120
- # Mask the same observations for all variables, and then drop those
121
- # observations (can't leave them masked, rankdata is weird).
122
- x = ma.mask_rowcols(x, axis=0)
123
- x = x[~x.mask.any(axis=1), :]
124
-
125
- # If either column is entirely NaN or Inf
126
- if not np.any(x.data):
127
- return SpearmanrResult(np.nan, np.nan)
128
-
129
- m = ma.getmask(x)
130
- n_obs = x.shape[0]
131
- dof = n_obs - 2 - int(m.sum(axis=0)[0])
132
- if dof < 0:
133
- return SpearmanrResult(np.nan, np.nan)
134
-
135
- # Gets the ranks and rank differences
136
- x_ranked = rankdata(x, axis=0)
137
- rs = ma.corrcoef(x_ranked, rowvar=False).data
138
-
139
- # rs can have elements equal to 1, so avoid zero division warnings
140
- with np.errstate(divide="ignore"):
141
- # clip the small negative values possibly caused by rounding
142
- # errors before taking the square root
143
- t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
-
145
- t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
146
-
147
- # For backwards compatibility, return scalars when comparing 2 columns
148
- if rs.shape == (2, 2):
149
- return SpearmanrResult(rs[1, 0], prob[1, 0])
150
- else:
151
- return SpearmanrResult(rs, prob)
152
-
153
- # Need to do this per pair of variables, otherwise the dropped observations
154
- # in a third column mess up the result for a pair.
155
- n_vars = x.shape[1]
156
- if n_vars == 2:
157
- return _spearmanr_2cols(x)
158
- else:
159
- max_cpu_cores = cpu_count(logical=False)
160
- with np.errstate(divide="ignore"):
161
- results = Parallel(n_jobs=max_cpu_cores)(
162
- delayed(_spearmanr_2cols)(x[:, [var1, var2]])
163
- for var1 in range(n_vars - 1)
164
- for var2 in range(var1 + 1, n_vars)
165
- )
166
-
167
- rs = np.ones((n_vars, n_vars), dtype=float)
168
- prob = np.zeros((n_vars, n_vars), dtype=float)
169
- for var1 in range(n_vars - 1):
170
- for var2 in range(var1 + 1, n_vars):
171
- result = results.pop(0)
172
- rs[var1, var2] = result.correlation
173
- rs[var2, var1] = result.correlation
174
- prob[var1, var2] = result.pvalue
175
- prob[var2, var1] = result.pvalue
176
-
177
- return SpearmanrResult(rs, prob)
@@ -1,160 +0,0 @@
1
- import hashlib
2
- from typing import Any, Dict, List
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from joblib import Parallel, delayed
7
- from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
8
- from psutil import cpu_count
9
- from scipy.stats import skew, spearmanr
10
-
11
- from upgini.metadata import ModelTaskType, SearchKey
12
- from upgini.utils import mstats
13
-
14
-
15
- def sort_columns(
16
- df: pd.DataFrame,
17
- target_column: str,
18
- search_keys: Dict[str, SearchKey],
19
- model_task_type: ModelTaskType,
20
- exclude_columns: List[str],
21
- ) -> List[str]:
22
- df = df.copy() # avoid side effects
23
- sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
24
- sorted_keys = [k for k in sorted_keys if k not in exclude_columns]
25
-
26
- other_columns = sorted(
27
- [
28
- c
29
- for c in df.columns
30
- if c not in sorted_keys
31
- and c not in exclude_columns
32
- and df[c].nunique() > 1
33
- ]
34
- )
35
-
36
- target = prepare_target(df[target_column], model_task_type)
37
- sort_dict = get_sort_columns_dict(df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True)
38
- other_columns = [c for c in other_columns if c in sort_dict]
39
- columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
40
- return columns_for_sort
41
-
42
-
43
- def get_sort_columns_dict(
44
- df: pd.DataFrame,
45
- target: pd.Series,
46
- sorted_keys: List[str],
47
- omit_nan: bool,
48
- n_jobs: int | None = None,
49
- ) -> dict[str, Any]:
50
- string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
51
- columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
52
- if len(string_features) > 0:
53
- if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
54
- # factorize string features
55
- for c in string_features:
56
- df[c] = df[c].factorize(sort=True)[0]
57
- columns_for_sort.extend(string_features)
58
-
59
- if len(columns_for_sort) == 0:
60
- return {}
61
-
62
- df = df[columns_for_sort]
63
- hashes = [hash_series(df[col]) for col in columns_for_sort]
64
- df = np.asarray(df, dtype=np.float32)
65
- correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
66
-
67
- sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
68
- return sort_dict
69
-
70
-
71
- def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
72
- target_correlations = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
73
-
74
- return np.max(target_correlations, axis=0)
75
-
76
-
77
- def get_target_correlations(
78
- df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
79
- ):
80
- df = np.asarray(df, dtype=np.float32)
81
- target_correlations = np.zeros((2, df.shape[1]))
82
- target_correlations[0, :] = np.nan_to_num(
83
- calculate_spearman_corr_with_target(df, target, omit_nan, n_jobs), copy=False
84
- )
85
- target_correlations[1, :] = np.nan_to_num(np.abs(np.corrcoef(df.T, target.T, rowvar=True)[-1, :-1]))
86
-
87
- target_correlations = np.trunc(target_correlations * 10**precision) / (10**precision)
88
-
89
- return target_correlations
90
-
91
-
92
- def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
93
- return {k: v[0] for k, v in sort_dict.items()}
94
-
95
-
96
- def calculate_spearman_corr_with_target(
97
- X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
98
- ) -> np.ndarray:
99
- if isinstance(X, pd.DataFrame):
100
- X = np.asarray(X, dtype=np.float32)
101
-
102
- if X.size == 0:
103
- return np.ndarray(shape=(0,))
104
-
105
- all_correlations = np.zeros(X.shape[1])
106
- all_correlations.fill(np.nan)
107
- cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
108
-
109
- if omit_nan:
110
- results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
111
- delayed(mstats.spearmanr)(
112
- X[:, i],
113
- y,
114
- nan_policy="omit",
115
- axis=0,
116
- )
117
- for i in cols2calc
118
- )
119
- target_correlations = np.array([abs(res.correlation) for res in results])
120
- else:
121
- cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
122
- target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
123
- if isinstance(target_correlations, float):
124
- target_correlations = np.abs([target_correlations])
125
- else:
126
- target_correlations = np.abs(target_correlations)[-1, :-1]
127
-
128
- all_correlations[cols2calc] = target_correlations
129
-
130
- return all_correlations
131
-
132
-
133
- def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
134
- features_num = X.shape[1]
135
- if y is not None:
136
- features_num += 1
137
-
138
- if features_num < 2:
139
- return 1.0
140
- else:
141
- return spearmanr(X, y, nan_policy=nan_policy).correlation
142
-
143
-
144
- def hash_series(series: pd.Series) -> int:
145
- return int(hashlib.sha256(pd.util.hash_pandas_object(series, index=True).values).hexdigest(), 16)
146
-
147
-
148
- def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
149
- target_name = target.name
150
- if model_task_type != ModelTaskType.REGRESSION or (
151
- not is_numeric_dtype(target) and not is_datetime64_any_dtype(target)
152
- ):
153
- target = target.astype(str).astype("category").cat.codes
154
-
155
- elif model_task_type == ModelTaskType.REGRESSION:
156
- skewness = round(abs(skew(target)), 2)
157
- if (target.min() >= 0) and (skewness >= 0.9):
158
- target = np.log1p(target)
159
-
160
- return pd.Series(target, name=target_name)
File without changes
File without changes
File without changes