upgini 1.2.57a3675.dev5.tar.gz → 1.2.58a1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (69)
  1. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/PKG-INFO +2 -2
  2. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/pyproject.toml +1 -1
  3. upgini-1.2.58a1/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/date.py +0 -8
  5. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/feature.py +10 -1
  6. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/data_source/data_source_publisher.py +0 -1
  7. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/dataset.py +8 -16
  8. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/features_enricher.py +60 -51
  9. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/resource_bundle/strings.properties +1 -1
  10. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/email_utils.py +6 -6
  11. upgini-1.2.58a1/src/upgini/utils/mstats.py +177 -0
  12. upgini-1.2.58a1/src/upgini/utils/sort.py +160 -0
  13. upgini-1.2.57a3675.dev5/src/upgini/__about__.py +0 -1
  14. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/.gitignore +0 -0
  15. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/LICENSE +0 -0
  16. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/README.md +0 -0
  17. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/__init__.py +0 -0
  18. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/ads.py +0 -0
  19. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/ads_management/__init__.py +0 -0
  20. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/ads_management/ads_manager.py +0 -0
  21. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/__init__.py +0 -0
  22. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/all_operands.py +0 -0
  23. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/binary.py +0 -0
  24. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/http.py +0 -0
  31. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/lazy_import.py +0 -0
  32. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/metadata.py +0 -0
  35. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/metrics.py +0 -0
  36. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/normalizer/normalize_utils.py +0 -0
  38. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/resource_bundle/__init__.py +0 -0
  39. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  40. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/datetime_utils.py +0 -0
  55. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/feature_info.py +0 -0
  59. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/ip_utils.py +0 -0
  62. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/target_utils.py +0 -0
  67. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/version_validator.py +0 -0
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.57a3675.dev5
+Version: 1.2.58a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
 Requires-Dist: python-dateutil>=2.8.0
-Requires-Dist: python-json-logger>=3.3.0
+Requires-Dist: python-json-logger>=2.0.2
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/pyproject.toml
@@ -43,7 +43,7 @@ dependencies = [
     "pydantic>1.0.0,<3.0.0",
     "pyjwt>=2.8.0",
     "python-dateutil>=2.8.0",
-    "python-json-logger>=3.3.0",
+    "python-json-logger>=2.0.2",
     "requests>=2.8.0",
     "scikit-learn>=1.3.0",
     "python-bidi==0.4.2",
upgini-1.2.58a1/src/upgini/__about__.py (new file)
@@ -0,0 +1 @@
+__version__ = "1.2.58a1"
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/date.py
@@ -64,9 +64,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return res
 
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        if left.isna().all() or right.isna().all():
-            return pd.Series([None] * len(left))
-
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
         diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -145,9 +142,6 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
         return cls(aggregation=aggregation)
 
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        if left.isna().all() or right.isna().all():
-            return pd.Series([None] * len(left), dtype=np.float64)
-
         left = self._convert_to_date(left, self.left_unit)
         right_mask = right.apply(lambda x: len(x) > 0)
         mask = left.notna() & right.notna() & right_mask
@@ -236,8 +230,6 @@ class DatePercentileBase(PandasOperand, abc.ABC):
         pass
 
     def _perc(self, f, bounds):
-        if f is None or np.isnan(f):
-            return np.nan
         hit = np.where(f >= np.array(bounds))[0]
         if hit.size > 0:
             return np.max(hit) + 1
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/autofe/feature.py
@@ -26,9 +26,18 @@ class Column:
         return dict()
 
     def rename_columns(self, mapping: Dict[str, str]) -> "Column":
-        self.name = mapping.get(self.name) or self.name
+        self.name = self._unhash(mapping.get(self.name) or self.name)
         return self
 
+    def _unhash(self, feature_name: str) -> str:
+        last_component_idx = feature_name.rfind("_")
+        if not feature_name.startswith("f_"):
+            return feature_name  # etalon feature
+        elif last_component_idx == 1:
+            return feature_name[2:]  # fully hashed name, cannot unhash
+        else:
+            return feature_name[2:last_component_idx]
+
     def delete_data(self):
         self.data = None
 
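The new Column._unhash recovers the original feature name from the hashed form (an "f_" prefix plus a trailing hash component). A minimal standalone re-statement of the same logic; the example feature names below are made up for illustration:

# Re-statement of Column._unhash for illustration only;
# the feature names are hypothetical.
def unhash(feature_name: str) -> str:
    last_component_idx = feature_name.rfind("_")
    if not feature_name.startswith("f_"):
        return feature_name  # etalon (client) feature: kept as is
    elif last_component_idx == 1:
        return feature_name[2:]  # "f_<hash>": fully hashed, only the prefix can be dropped
    else:
        return feature_name[2:last_component_idx]  # drop "f_" prefix and hash suffix

assert unhash("age") == "age"
assert unhash("f_3fa9c1") == "3fa9c1"
assert unhash("f_email_domain_9f8e7d") == "email_domain"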
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/data_source/data_source_publisher.py
@@ -386,7 +386,6 @@ class DataSourcePublisher:
         search_keys = [k.value.value for k in search_keys] if search_keys else None
         request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
         task_id = self._rest_client.upload_online(request, trace_id)
-        print(f"Uploading online task created. task_id={task_id}")
         with Spinner():
             status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
             while status_response["status"] not in self.FINAL_STATUSES:
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/dataset.py
@@ -587,23 +587,15 @@ class Dataset: # (pd.DataFrame):
         if (
             runtime_parameters is not None
             and runtime_parameters.properties is not None
+            and "generate_features" in runtime_parameters.properties
         ):
-            if "generate_features" in runtime_parameters.properties:
-                generate_features = runtime_parameters.properties["generate_features"].split(",")
-                renamed_generate_features = []
-                for f in generate_features:
-                    for new_column, orig_column in self.columns_renaming.items():
-                        if f == orig_column:
-                            renamed_generate_features.append(new_column)
-                runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
-            if "columns_for_online_api" in runtime_parameters.properties:
-                columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
-                renamed_columns_for_online_api = []
-                for f in columns_for_online_api:
-                    for new_column, orig_column in self.columns_renaming.items():
-                        if f == orig_column:
-                            renamed_columns_for_online_api.append(new_column)
-                runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
+            generate_features = runtime_parameters.properties["generate_features"].split(",")
+            renamed_generate_features = []
+            for f in generate_features:
+                for new_column, orig_column in self.columns_renaming.items():
+                    if f == orig_column:
+                        renamed_generate_features.append(new_column)
+            runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
 
         return runtime_parameters
 
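The surviving branch translates user-facing column names in generate_features to the internal renamed columns before the request is sent. A toy run of that loop, with a hypothetical renaming map (internal name → original name):

columns_renaming = {"feature_0": "description", "feature_1": "city"}  # hypothetical
generate_features = "description,city".split(",")

renamed_generate_features = []
for f in generate_features:
    for new_column, orig_column in columns_renaming.items():
        if f == orig_column:
            renamed_generate_features.append(new_column)

assert ",".join(renamed_generate_features) == "feature_0,feature_1"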
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/features_enricher.py
@@ -112,6 +112,7 @@ try:
 except Exception:
     from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
 
+from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
     balance_undersample_forced,
     calculate_psi,
@@ -222,7 +223,6 @@ class FeaturesEnricher(TransformerMixin):
         loss: Optional[str] = None,
         detect_missing_search_keys: bool = True,
         generate_features: Optional[List[str]] = None,
-        columns_for_online_api: Optional[List[str]] = None,
         round_embeddings: Optional[int] = None,
         logs_enabled: bool = True,
         raise_validation_error: bool = True,
@@ -346,9 +346,6 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.error(msg)
             raise ValidationError(msg)
         self.runtime_parameters.properties["round_embeddings"] = round_embeddings
-        self.columns_for_online_api = columns_for_online_api
-        if columns_for_online_api is not None:
-            self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
         maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
         if maybe_downsampling_limit is not None:
             Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -1261,7 +1258,7 @@
             for feature, shap in new_shaps.items()
             if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
         }
-        self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
+        self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
 
         if self.features_info_display_handle is not None:
             try:
@@ -1738,7 +1735,7 @@
             self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
             df = df.sample(n=sample_rows, random_state=self.random_state)
 
-        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
+        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET)
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
 
@@ -1919,6 +1916,7 @@
             progress_bar=progress_bar,
             progress_callback=progress_callback,
             add_fit_system_record_id=True,
+            target_name=tmp_target_name,
         )
         if enriched_df is None:
             return None
@@ -1968,6 +1966,7 @@
             progress_bar=progress_bar,
             progress_callback=progress_callback,
             add_fit_system_record_id=True,
+            target_name=tmp_target_name,
         )
         if enriched_Xy is None:
             return None
@@ -2129,6 +2128,7 @@ if response.status_code == 200:
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         add_fit_system_record_id: bool = False,
+        target_name: Optional[str] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2313,8 +2313,11 @@
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]
 
-        if add_fit_system_record_id:
-            df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
+        if add_fit_system_record_id and target_name is not None:
+            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            df = self.__add_fit_system_record_id(
+                df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
+            )
             df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
             features_not_to_pass.append(SORT_ID)
 
@@ -2624,18 +2627,17 @@
         checked_generate_features = []
         for gen_feature in self.generate_features:
             if gen_feature not in x_columns:
-                msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
-                self.__log_warning(msg)
+                if gen_feature == self._get_phone_column(self.search_keys):
+                    raise ValidationError(
+                        self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
+                    )
+                else:
+                    self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
             else:
                 checked_generate_features.append(gen_feature)
         self.generate_features = checked_generate_features
        self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
 
-        if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
-            for column in self.columns_for_online_api:
-                if column not in validated_X.columns:
-                    raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
-
         if self.id_columns is not None:
             for id_column in self.id_columns:
                 if id_column not in validated_X.columns:
@@ -2759,7 +2761,7 @@
             self.__log_warning(full_duplicates_warning)
 
         # Explode multiple search keys
-        df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
+        df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET)
 
         # TODO check that this is correct for enrichment
         self.df_with_original_index = df.copy()
@@ -2841,7 +2843,7 @@
         if eval_set is not None and len(eval_set) > 0:
             meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
 
-        df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
+        df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -3533,53 +3535,60 @@
         # meaning_types: Dict[str, FileColumnMeaningType],
         search_keys: Dict[str, SearchKey],
         id_name: str,
+        target_name: str,
     ) -> pd.DataFrame:
-        # save original order or rows
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
         original_order_name = "original_order"
+        # Save original index
        df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
+        # Save original order
        df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
 
-        # order by date and idempotent order by other keys
-        if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
-            sort_exclude_columns = [
-                original_order_name,
-                ORIGINAL_INDEX,
-                EVAL_SET_INDEX,
-                TARGET,
-                "__target",
-                ENTITY_SYSTEM_RECORD_ID,
-            ]
-            if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-                date_column = DateTimeSearchKeyConverter.DATETIME_COL
-                sort_exclude_columns.append(self._get_date_column(search_keys))
-            else:
-                date_column = self._get_date_column(search_keys)
-            sort_columns = [date_column] if date_column is not None else []
+        # order by date and idempotent order by other keys and features
 
-            sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
-            sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
+        sort_exclude_columns = [
+            original_order_name,
+            ORIGINAL_INDEX,
+            EVAL_SET_INDEX,
+            TARGET,
+            "__target",
+            ENTITY_SYSTEM_RECORD_ID,
+        ]
+        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
+            date_column = DateTimeSearchKeyConverter.DATETIME_COL
+            sort_exclude_columns.append(self._get_date_column(search_keys))
+        else:
+            date_column = self._get_date_column(search_keys)
+            sort_exclude_columns.append(date_column)
+        columns_to_sort = [date_column] if date_column is not None else []
 
-            other_columns = sorted(
-                [
-                    c
-                    for c in df.columns
-                    if c not in sort_columns
-                    and c not in sorted_other_keys
-                    and c not in sort_exclude_columns
-                    and df[c].nunique() > 1
-                ]
-            )
+        do_sorting = True
+        if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
+            # Check duplicates by date and id_columns
+            duplicate_check_columns = [c for c in self.id_columns if c in df.columns]
+            if date_column is not None:
+                duplicate_check_columns.append(date_column)
 
-            all_other_columns = sorted_other_keys + other_columns
+            duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
+            if duplicates.any():
+                self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
+                do_sorting = False
+            else:
+                columns_to_hash = list(search_keys.keys()) + self.id_columns
+                columns_to_hash = sort_columns(
+                    df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
+                )
+        else:
+            columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
 
+        if do_sorting:
             search_keys_hash = "search_keys_hash"
-            if len(all_other_columns) > 0:
-                sort_columns.append(search_keys_hash)
-                df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
+            if len(columns_to_hash) > 0:
+                df[search_keys_hash] = pd.util.hash_pandas_object(df[columns_to_hash], index=False)
+                columns_to_sort.append(search_keys_hash)
 
-            df = df.sort_values(by=sort_columns)
+            df = df.sort_values(by=columns_to_sort)
 
         if search_keys_hash in df.columns:
             df.drop(columns=search_keys_hash, inplace=True)
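The idempotent ordering that __add_fit_system_record_id builds boils down to sorting by the date column plus a row-wise hash of the selected key and feature columns. A minimal sketch of that mechanism on toy data (column names are made up):

import pandas as pd

df = pd.DataFrame(
    {
        "date": ["2024-01-02", "2024-01-01", "2024-01-01"],
        "phone": ["111", "333", "222"],
    }
)

# pd.util.hash_pandas_object gives a stable per-row hash, so repeated fits
# produce the same row order regardless of the incoming order.
df["search_keys_hash"] = pd.util.hash_pandas_object(df[["phone"]], index=False)
df = df.sort_values(by=["date", "search_keys_hash"]).drop(columns="search_keys_hash")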
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/resource_bundle/strings.properties
@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
 loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
 loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
 multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
+date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
 group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
 current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
 # Errors
@@ -111,7 +112,6 @@ x_is_empty=X is empty
 y_is_empty=y is empty
 x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
 missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
-missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
 x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
 train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
{upgini-1.2.57a3675.dev5 → upgini-1.2.58a1}/src/upgini/utils/email_utils.py
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
         else:
             df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
 
-        # del self.search_keys[self.email_column]
-        # if self.email_column in self.unnest_search_keys:
-        #     self.unnest_search_keys.remove(self.email_column)
+        del self.search_keys[self.email_column]
+        if self.email_column in self.unnest_search_keys:
+            self.unnest_search_keys.remove(self.email_column)
 
         one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
         df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
         self.columns_renaming[one_domain_name] = original_email_column
         self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
 
-        # if self.email_converted_to_hem:
-        #     df = df.drop(columns=self.email_column)
-        #     del self.columns_renaming[self.email_column]
+        if self.email_converted_to_hem:
+            df = df.drop(columns=self.email_column)
+            del self.columns_renaming[self.email_column]
 
         return df
upgini-1.2.58a1/src/upgini/utils/mstats.py (new file)
@@ -0,0 +1,177 @@
+import warnings
+from collections import namedtuple
+
+import numpy as np
+import numpy.ma as ma
+import scipy
+from joblib import Parallel, delayed
+from numpy import ndarray
+from psutil import cpu_count
+
+np.seterr(divide="ignore")
+
+
+warnings.simplefilter(action="ignore", category=RuntimeWarning)
+
+
+def _find_repeats(arr):
+    # This function assumes it may clobber its input.
+    if len(arr) == 0:
+        return np.array(0, np.float64), np.array(0, np.intp)
+
+    # XXX This cast was previously needed for the Fortran implementation,
+    # should we ditch it?
+    arr = np.asarray(arr, np.float64).ravel()
+    arr.sort()
+
+    # Taken from NumPy 1.9's np.unique.
+    change = np.concatenate(([True], arr[1:] != arr[:-1]))
+    unique = arr[change]
+    change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
+    freq = np.diff(change_idx)
+    atleast2 = freq > 1
+    return unique[atleast2], freq[atleast2]
+
+
+def find_repeats(arr):
+    # Make sure we get a copy. ma.compressed promises a "new array", but can
+    # actually return a reference.
+    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
+    try:
+        need_copy = np.may_share_memory(compr, arr)
+    except AttributeError:
+        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
+        # while in numpy 1.8.2 and above it just (correctly) returns False.
+        need_copy = False
+    if need_copy:
+        compr = compr.copy()
+    return _find_repeats(compr)
+
+
+def rankdata(data, axis=None, use_missing=False):
+    def _rank1d(data, use_missing=False):
+        n = data.count()
+        rk = np.empty(data.size, dtype=float)
+        idx = data.argsort()
+        rk[idx[:n]] = np.arange(1, n + 1)
+
+        if use_missing:
+            rk[idx[n:]] = (n + 1) / 2.0
+        else:
+            rk[idx[n:]] = 0
+
+        repeats = find_repeats(data.copy())
+        for r in repeats[0]:
+            condition = (data == r).filled(False)
+            rk[condition] = rk[condition].mean()
+        return rk
+
+    data = ma.array(data, copy=False)
+    if axis is None:
+        if data.ndim > 1:
+            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
+        else:
+            return _rank1d(data, use_missing)
+    else:
+        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
+
+
+def _chk_asarray(a, axis):
+    # Always returns a masked array, raveled for axis=None
+    a = ma.asanyarray(a)
+    if axis is None:
+        a = ma.ravel(a)
+        outaxis = 0
+    else:
+        outaxis = axis
+    return a, outaxis
+
+
+SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
+
+
+# Taken from scipy.mstats with following tweaks:
+# 1. parallel pairwise computation
+# 2. custom masking
+def spearmanr(
+    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
+):
+    if not use_ties:
+        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
+
+    # Always returns a masked array, raveled if axis=None
+    x, axisout = _chk_asarray(x, axis)
+    if y is not None:
+        # Deal only with 2-D `x` case.
+        y, _ = _chk_asarray(y, axis)
+        if axisout == 0:
+            x = ma.column_stack((x, y))
+        else:
+            x = ma.row_stack((x, y))
+
+    if axisout == 1:
+        # To simplify the code that follow (always use `n_obs, n_vars` shape)
+        x = x.T
+
+    if nan_policy == "omit":
+        x = mask_fn(x)
+
+    def _spearmanr_2cols(x):
+        # Mask the same observations for all variables, and then drop those
+        # observations (can't leave them masked, rankdata is weird).
+        x = ma.mask_rowcols(x, axis=0)
+        x = x[~x.mask.any(axis=1), :]
+
+        # If either column is entirely NaN or Inf
+        if not np.any(x.data):
+            return SpearmanrResult(np.nan, np.nan)
+
+        m = ma.getmask(x)
+        n_obs = x.shape[0]
+        dof = n_obs - 2 - int(m.sum(axis=0)[0])
+        if dof < 0:
+            return SpearmanrResult(np.nan, np.nan)
+
+        # Gets the ranks and rank differences
+        x_ranked = rankdata(x, axis=0)
+        rs = ma.corrcoef(x_ranked, rowvar=False).data
+
+        # rs can have elements equal to 1, so avoid zero division warnings
+        with np.errstate(divide="ignore"):
+            # clip the small negative values possibly caused by rounding
+            # errors before taking the square root
+            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
+
+        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
+
+        # For backwards compatibility, return scalars when comparing 2 columns
+        if rs.shape == (2, 2):
+            return SpearmanrResult(rs[1, 0], prob[1, 0])
+        else:
+            return SpearmanrResult(rs, prob)
+
+    # Need to do this per pair of variables, otherwise the dropped observations
+    # in a third column mess up the result for a pair.
+    n_vars = x.shape[1]
+    if n_vars == 2:
+        return _spearmanr_2cols(x)
+    else:
+        max_cpu_cores = cpu_count(logical=False)
+        with np.errstate(divide="ignore"):
+            results = Parallel(n_jobs=max_cpu_cores)(
+                delayed(_spearmanr_2cols)(x[:, [var1, var2]])
+                for var1 in range(n_vars - 1)
+                for var2 in range(var1 + 1, n_vars)
+            )
+
+        rs = np.ones((n_vars, n_vars), dtype=float)
+        prob = np.zeros((n_vars, n_vars), dtype=float)
+        for var1 in range(n_vars - 1):
+            for var2 in range(var1 + 1, n_vars):
+                result = results.pop(0)
+                rs[var1, var2] = result.correlation
+                rs[var2, var1] = result.correlation
+                prob[var1, var2] = result.pvalue
+                prob[var2, var1] = result.pvalue
+
+        return SpearmanrResult(rs, prob)
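A minimal usage sketch of this vendored helper (the input arrays are made up). With nan_policy="omit", invalid values are masked per pair of columns instead of dropping every row with any NaN up front:

import numpy as np

from upgini.utils import mstats

x = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
y = np.array([2.0, 1.0, 4.0, 8.0, 10.0])

# The default mask_fn=ma.masked_invalid hides the NaN in `x`,
# so only that single observation is excluded from the ranking.
result = mstats.spearmanr(x, y, nan_policy="omit", axis=0)
print(result.correlation, result.pvalue)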
upgini-1.2.58a1/src/upgini/utils/sort.py (new file)
@@ -0,0 +1,160 @@
+import hashlib
+from typing import Any, Dict, List
+
+import numpy as np
+import pandas as pd
+from joblib import Parallel, delayed
+from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
+from psutil import cpu_count
+from scipy.stats import skew, spearmanr
+
+from upgini.metadata import ModelTaskType, SearchKey
+from upgini.utils import mstats
+
+
+def sort_columns(
+    df: pd.DataFrame,
+    target_column: str,
+    search_keys: Dict[str, SearchKey],
+    model_task_type: ModelTaskType,
+    exclude_columns: List[str],
+) -> List[str]:
+    df = df.copy()  # avoid side effects
+    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
+    sorted_keys = [k for k in sorted_keys if k not in exclude_columns]
+
+    other_columns = sorted(
+        [
+            c
+            for c in df.columns
+            if c not in sorted_keys
+            and c not in exclude_columns
+            and df[c].nunique() > 1
+        ]
+    )
+
+    target = prepare_target(df[target_column], model_task_type)
+    sort_dict = get_sort_columns_dict(df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True)
+    other_columns = [c for c in other_columns if c in sort_dict]
+    columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
+    return columns_for_sort
+
+
+def get_sort_columns_dict(
+    df: pd.DataFrame,
+    target: pd.Series,
+    sorted_keys: List[str],
+    omit_nan: bool,
+    n_jobs: int | None = None,
+) -> dict[str, Any]:
+    string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
+    columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
+    if len(string_features) > 0:
+        if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
+            # factorize string features
+            for c in string_features:
+                df[c] = df[c].factorize(sort=True)[0]
+            columns_for_sort.extend(string_features)
+
+    if len(columns_for_sort) == 0:
+        return {}
+
+    df = df[columns_for_sort]
+    hashes = [hash_series(df[col]) for col in columns_for_sort]
+    df = np.asarray(df, dtype=np.float32)
+    correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
+
+    sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
+    return sort_dict
+
+
+def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
+    target_correlations = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
+
+    return np.max(target_correlations, axis=0)
+
+
+def get_target_correlations(
+    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
+):
+    df = np.asarray(df, dtype=np.float32)
+    target_correlations = np.zeros((2, df.shape[1]))
+    target_correlations[0, :] = np.nan_to_num(
+        calculate_spearman_corr_with_target(df, target, omit_nan, n_jobs), copy=False
+    )
+    target_correlations[1, :] = np.nan_to_num(np.abs(np.corrcoef(df.T, target.T, rowvar=True)[-1, :-1]))
+
+    target_correlations = np.trunc(target_correlations * 10**precision) / (10**precision)
+
+    return target_correlations
+
+
+def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
+    return {k: v[0] for k, v in sort_dict.items()}
+
+
+def calculate_spearman_corr_with_target(
+    X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
+) -> np.ndarray:
+    if isinstance(X, pd.DataFrame):
+        X = np.asarray(X, dtype=np.float32)
+
+    if X.size == 0:
+        return np.ndarray(shape=(0,))
+
+    all_correlations = np.zeros(X.shape[1])
+    all_correlations.fill(np.nan)
+    cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
+
+    if omit_nan:
+        results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
+            delayed(mstats.spearmanr)(
+                X[:, i],
+                y,
+                nan_policy="omit",
+                axis=0,
+            )
+            for i in cols2calc
+        )
+        target_correlations = np.array([abs(res.correlation) for res in results])
+    else:
+        cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
+        target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
+        if isinstance(target_correlations, float):
+            target_correlations = np.abs([target_correlations])
+        else:
+            target_correlations = np.abs(target_correlations)[-1, :-1]
+
+    all_correlations[cols2calc] = target_correlations
+
+    return all_correlations
+
+
+def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
+    features_num = X.shape[1]
+    if y is not None:
+        features_num += 1
+
+    if features_num < 2:
+        return 1.0
+    else:
+        return spearmanr(X, y, nan_policy=nan_policy).correlation
+
+
+def hash_series(series: pd.Series) -> int:
+    return int(hashlib.sha256(pd.util.hash_pandas_object(series, index=True).values).hexdigest(), 16)
+
+
+def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
+    target_name = target.name
+    if model_task_type != ModelTaskType.REGRESSION or (
+        not is_numeric_dtype(target) and not is_datetime64_any_dtype(target)
+    ):
+        target = target.astype(str).astype("category").cat.codes
+
+    elif model_task_type == ModelTaskType.REGRESSION:
+        skewness = round(abs(skew(target)), 2)
+        if (target.min() >= 0) and (skewness >= 0.9):
+            target = np.log1p(target)
+
+    return pd.Series(target, name=target_name)
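A hypothetical end-to-end call, mirroring how features_enricher.py uses sort_columns (the data and the ModelTaskType value below are assumptions for illustration):

import pandas as pd

from upgini.metadata import ModelTaskType, SearchKey
from upgini.utils.sort import sort_columns

# Toy frame: one search key, a binary target, two candidate features.
df = pd.DataFrame(
    {
        "phone": ["111", "222", "333", "444"],
        "target": [0, 1, 0, 1],
        "age": [25, 40, 31, 52],
        "income": [10.0, 20.0, 15.0, 30.0],
    }
)

# Search keys come first; the remaining columns follow, ordered by their
# strongest absolute correlation with the prepared target, descending.
ordered = sort_columns(
    df,
    target_column="target",
    search_keys={"phone": SearchKey.PHONE},
    model_task_type=ModelTaskType.BINARY,
    exclude_columns=["target"],
)
print(ordered)  # e.g. ["phone", "income", "age"]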
upgini-1.2.57a3675.dev5/src/upgini/__about__.py (removed)
@@ -1 +0,0 @@
-__version__ = "1.2.57a3675.dev5"