upgini 1.2.58a1__tar.gz → 1.2.59__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (69)
  1. {upgini-1.2.58a1 → upgini-1.2.59}/PKG-INFO +2 -2
  2. {upgini-1.2.58a1 → upgini-1.2.59}/pyproject.toml +1 -1
  3. upgini-1.2.59/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/date.py +8 -0
  5. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/feature.py +1 -10
  6. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/data_source/data_source_publisher.py +1 -0
  7. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/dataset.py +16 -8
  8. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/features_enricher.py +74 -69
  9. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/resource_bundle/strings.properties +1 -1
  10. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/email_utils.py +6 -6
  11. upgini-1.2.58a1/src/upgini/__about__.py +0 -1
  12. upgini-1.2.58a1/src/upgini/utils/mstats.py +0 -177
  13. upgini-1.2.58a1/src/upgini/utils/sort.py +0 -160
  14. {upgini-1.2.58a1 → upgini-1.2.59}/.gitignore +0 -0
  15. {upgini-1.2.58a1 → upgini-1.2.59}/LICENSE +0 -0
  16. {upgini-1.2.58a1 → upgini-1.2.59}/README.md +0 -0
  17. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/__init__.py +0 -0
  18. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/ads.py +0 -0
  19. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/ads_management/__init__.py +0 -0
  20. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/ads_management/ads_manager.py +0 -0
  21. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/__init__.py +0 -0
  22. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/all_operands.py +0 -0
  23. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/binary.py +0 -0
  24. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/http.py +0 -0
  31. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/lazy_import.py +0 -0
  32. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/metadata.py +0 -0
  35. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/metrics.py +0 -0
  36. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/normalizer/normalize_utils.py +0 -0
  38. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/resource_bundle/__init__.py +0 -0
  39. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/resource_bundle/exceptions.py +0 -0
  40. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/datetime_utils.py +0 -0
  55. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/feature_info.py +0 -0
  59. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/ip_utils.py +0 -0
  62. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/target_utils.py +0 -0
  67. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/track_info.py +0 -0
  68. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/utils/warning_counter.py +0 -0
  69. {upgini-1.2.58a1 → upgini-1.2.59}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.58a1
3
+ Version: 1.2.59
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
34
34
  Requires-Dist: pyjwt>=2.8.0
35
35
  Requires-Dist: python-bidi==0.4.2
36
36
  Requires-Dist: python-dateutil>=2.8.0
37
- Requires-Dist: python-json-logger>=2.0.2
37
+ Requires-Dist: python-json-logger>=3.3.0
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
40
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -43,7 +43,7 @@ dependencies = [
43
43
  "pydantic>1.0.0,<3.0.0",
44
44
  "pyjwt>=2.8.0",
45
45
  "python-dateutil>=2.8.0",
46
- "python-json-logger>=2.0.2",
46
+ "python-json-logger>=3.3.0",
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
49
  "python-bidi==0.4.2",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.59"
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
64
  return res
65
65
 
66
66
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
67
+ if left.isna().all() or right.isna().all():
68
+ return pd.Series([None] * len(left))
69
+
67
70
  left = self._convert_to_date(left, self.left_unit)
68
71
  right = self._convert_to_date(right, self.right_unit)
69
72
  diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
142
145
  return cls(aggregation=aggregation)
143
146
 
144
147
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
148
+ if left.isna().all() or right.isna().all():
149
+ return pd.Series([None] * len(left), dtype=np.float64)
150
+
145
151
  left = self._convert_to_date(left, self.left_unit)
146
152
  right_mask = right.apply(lambda x: len(x) > 0)
147
153
  mask = left.notna() & right.notna() & right_mask
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
230
236
  pass
231
237
 
232
238
  def _perc(self, f, bounds):
239
+ if f is None or np.isnan(f):
240
+ return np.nan
233
241
  hit = np.where(f >= np.array(bounds))[0]
234
242
  if hit.size > 0:
235
243
  return np.max(hit) + 1
@@ -26,18 +26,9 @@ class Column:
26
26
  return dict()
27
27
 
28
28
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
29
- self.name = self._unhash(mapping.get(self.name) or self.name)
29
+ self.name = mapping.get(self.name) or self.name
30
30
  return self
31
31
 
32
- def _unhash(self, feature_name: str) -> str:
33
- last_component_idx = feature_name.rfind("_")
34
- if not feature_name.startswith("f_"):
35
- return feature_name # etalon feature
36
- elif last_component_idx == 1:
37
- return feature_name[2:] # fully hashed name, cannot unhash
38
- else:
39
- return feature_name[2:last_component_idx]
40
-
41
32
  def delete_data(self):
42
33
  self.data = None
43
34
 
@@ -386,6 +386,7 @@ class DataSourcePublisher:
386
386
  search_keys = [k.value.value for k in search_keys] if search_keys else None
387
387
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
388
388
  task_id = self._rest_client.upload_online(request, trace_id)
389
+ print(f"Uploading online task created. task_id={task_id}")
389
390
  with Spinner():
390
391
  status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
391
392
  while status_response["status"] not in self.FINAL_STATUSES:
@@ -587,15 +587,23 @@ class Dataset: # (pd.DataFrame):
587
587
  if (
588
588
  runtime_parameters is not None
589
589
  and runtime_parameters.properties is not None
590
- and "generate_features" in runtime_parameters.properties
591
590
  ):
592
- generate_features = runtime_parameters.properties["generate_features"].split(",")
593
- renamed_generate_features = []
594
- for f in generate_features:
595
- for new_column, orig_column in self.columns_renaming.items():
596
- if f == orig_column:
597
- renamed_generate_features.append(new_column)
598
- runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
591
+ if "generate_features" in runtime_parameters.properties:
592
+ generate_features = runtime_parameters.properties["generate_features"].split(",")
593
+ renamed_generate_features = []
594
+ for f in generate_features:
595
+ for new_column, orig_column in self.columns_renaming.items():
596
+ if f == orig_column:
597
+ renamed_generate_features.append(new_column)
598
+ runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
599
+ if "columns_for_online_api" in runtime_parameters.properties:
600
+ columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
601
+ renamed_columns_for_online_api = []
602
+ for f in columns_for_online_api:
603
+ for new_column, orig_column in self.columns_renaming.items():
604
+ if f == orig_column:
605
+ renamed_columns_for_online_api.append(new_column)
606
+ runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
599
607
 
600
608
  return runtime_parameters
601
609
 
@@ -112,7 +112,6 @@ try:
112
112
  except Exception:
113
113
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
114
114
 
115
- from upgini.utils.sort import sort_columns
116
115
  from upgini.utils.target_utils import (
117
116
  balance_undersample_forced,
118
117
  calculate_psi,
@@ -223,6 +222,7 @@ class FeaturesEnricher(TransformerMixin):
223
222
  loss: Optional[str] = None,
224
223
  detect_missing_search_keys: bool = True,
225
224
  generate_features: Optional[List[str]] = None,
225
+ columns_for_online_api: Optional[List[str]] = None,
226
226
  round_embeddings: Optional[int] = None,
227
227
  logs_enabled: bool = True,
228
228
  raise_validation_error: bool = True,
@@ -346,6 +346,9 @@ class FeaturesEnricher(TransformerMixin):
346
346
  self.logger.error(msg)
347
347
  raise ValidationError(msg)
348
348
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
349
+ self.columns_for_online_api = columns_for_online_api
350
+ if columns_for_online_api is not None:
351
+ self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
349
352
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
350
353
  if maybe_downsampling_limit is not None:
351
354
  Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -1258,7 +1261,7 @@ class FeaturesEnricher(TransformerMixin):
1258
1261
  for feature, shap in new_shaps.items()
1259
1262
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1260
1263
  }
1261
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1264
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1262
1265
 
1263
1266
  if self.features_info_display_handle is not None:
1264
1267
  try:
@@ -1735,7 +1738,7 @@ class FeaturesEnricher(TransformerMixin):
1735
1738
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1736
1739
  df = df.sample(n=sample_rows, random_state=self.random_state)
1737
1740
 
1738
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET)
1741
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1739
1742
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1740
1743
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1741
1744
 
@@ -1874,13 +1877,9 @@ class FeaturesEnricher(TransformerMixin):
1874
1877
 
1875
1878
  # downsample if need to eval_set threshold
1876
1879
  num_samples = _num_samples(df)
1877
- phone_column = self._get_phone_column(self.search_keys)
1878
1880
  force_downsampling = (
1879
1881
  not self.disable_force_downsampling
1880
- and self.generate_features is not None
1881
- and phone_column is not None
1882
- and self.fit_columns_renaming is not None
1883
- and self.fit_columns_renaming.get(phone_column) in self.generate_features
1882
+ and self.columns_for_online_api is not None
1884
1883
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1885
1884
  )
1886
1885
  if force_downsampling:
@@ -1916,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
1916
1915
  progress_bar=progress_bar,
1917
1916
  progress_callback=progress_callback,
1918
1917
  add_fit_system_record_id=True,
1919
- target_name=tmp_target_name,
1920
1918
  )
1921
1919
  if enriched_df is None:
1922
1920
  return None
@@ -1950,7 +1948,27 @@ class FeaturesEnricher(TransformerMixin):
1950
1948
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1951
1949
 
1952
1950
  num_samples = _num_samples(df)
1953
- if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1951
+ force_downsampling = (
1952
+ not self.disable_force_downsampling
1953
+ and self.columns_for_online_api is not None
1954
+ and num_samples > Dataset.FORCE_SAMPLE_SIZE
1955
+ )
1956
+ if force_downsampling:
1957
+ self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1958
+ df = balance_undersample_forced(
1959
+ df=df,
1960
+ target_column=TARGET,
1961
+ id_columns=self.id_columns,
1962
+ date_column=self._get_date_column(self.search_keys),
1963
+ task_type=self.model_task_type,
1964
+ cv_type=self.cv,
1965
+ random_state=self.random_state,
1966
+ sample_size=Dataset.FORCE_SAMPLE_SIZE,
1967
+ logger=self.logger,
1968
+ bundle=self.bundle,
1969
+ warning_callback=self.__log_warning,
1970
+ )
1971
+ elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1954
1972
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
1955
1973
  df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
1956
1974
 
@@ -1966,7 +1984,6 @@ class FeaturesEnricher(TransformerMixin):
1966
1984
  progress_bar=progress_bar,
1967
1985
  progress_callback=progress_callback,
1968
1986
  add_fit_system_record_id=True,
1969
- target_name=tmp_target_name,
1970
1987
  )
1971
1988
  if enriched_Xy is None:
1972
1989
  return None
@@ -2128,7 +2145,6 @@ if response.status_code == 200:
2128
2145
  progress_bar: Optional[ProgressBar] = None,
2129
2146
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2130
2147
  add_fit_system_record_id: bool = False,
2131
- target_name: Optional[str] = None,
2132
2148
  ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2133
2149
  if self._search_task is None:
2134
2150
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2313,11 +2329,8 @@ if response.status_code == 200:
2313
2329
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2314
2330
  ]
2315
2331
 
2316
- if add_fit_system_record_id and target_name is not None:
2317
- reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
2318
- df = self.__add_fit_system_record_id(
2319
- df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
2320
- )
2332
+ if add_fit_system_record_id:
2333
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2321
2334
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2322
2335
  features_not_to_pass.append(SORT_ID)
2323
2336
 
@@ -2627,17 +2640,18 @@ if response.status_code == 200:
2627
2640
  checked_generate_features = []
2628
2641
  for gen_feature in self.generate_features:
2629
2642
  if gen_feature not in x_columns:
2630
- if gen_feature == self._get_phone_column(self.search_keys):
2631
- raise ValidationError(
2632
- self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2633
- )
2634
- else:
2635
- self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2643
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2644
+ self.__log_warning(msg)
2636
2645
  else:
2637
2646
  checked_generate_features.append(gen_feature)
2638
2647
  self.generate_features = checked_generate_features
2639
2648
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2640
2649
 
2650
+ if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
2651
+ for column in self.columns_for_online_api:
2652
+ if column not in validated_X.columns:
2653
+ raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
2654
+
2641
2655
  if self.id_columns is not None:
2642
2656
  for id_column in self.id_columns:
2643
2657
  if id_column not in validated_X.columns:
@@ -2761,7 +2775,7 @@ if response.status_code == 200:
2761
2775
  self.__log_warning(full_duplicates_warning)
2762
2776
 
2763
2777
  # Explode multiple search keys
2764
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET)
2778
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2765
2779
 
2766
2780
  # TODO check that this is correct for enrichment
2767
2781
  self.df_with_original_index = df.copy()
@@ -2843,7 +2857,7 @@ if response.status_code == 200:
2843
2857
  if eval_set is not None and len(eval_set) > 0:
2844
2858
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2845
2859
 
2846
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET)
2860
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2847
2861
 
2848
2862
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2849
2863
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -2859,9 +2873,7 @@ if response.status_code == 200:
2859
2873
  # Force downsampling to 7000 for API features generation
2860
2874
  force_downsampling = (
2861
2875
  not self.disable_force_downsampling
2862
- and self.generate_features is not None
2863
- and phone_column is not None
2864
- and self.fit_columns_renaming[phone_column] in self.generate_features
2876
+ and self.columns_for_online_api is not None
2865
2877
  and len(df) > Dataset.FORCE_SAMPLE_SIZE
2866
2878
  )
2867
2879
  if force_downsampling:
@@ -3535,60 +3547,53 @@ if response.status_code == 200:
3535
3547
  # meaning_types: Dict[str, FileColumnMeaningType],
3536
3548
  search_keys: Dict[str, SearchKey],
3537
3549
  id_name: str,
3538
- target_name: str,
3539
3550
  ) -> pd.DataFrame:
3551
+ # save original order or rows
3540
3552
  original_index_name = df.index.name
3541
3553
  index_name = df.index.name or DEFAULT_INDEX
3542
3554
  original_order_name = "original_order"
3543
- # Save original index
3544
3555
  df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
3545
- # Save original order
3546
3556
  df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
3547
3557
 
3548
- # order by date and idempotent order by other keys and features
3558
+ # order by date and idempotent order by other keys
3559
+ if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3560
+ sort_exclude_columns = [
3561
+ original_order_name,
3562
+ ORIGINAL_INDEX,
3563
+ EVAL_SET_INDEX,
3564
+ TARGET,
3565
+ "__target",
3566
+ ENTITY_SYSTEM_RECORD_ID,
3567
+ ]
3568
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3569
+ date_column = DateTimeSearchKeyConverter.DATETIME_COL
3570
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3571
+ else:
3572
+ date_column = self._get_date_column(search_keys)
3573
+ sort_columns = [date_column] if date_column is not None else []
3549
3574
 
3550
- sort_exclude_columns = [
3551
- original_order_name,
3552
- ORIGINAL_INDEX,
3553
- EVAL_SET_INDEX,
3554
- TARGET,
3555
- "__target",
3556
- ENTITY_SYSTEM_RECORD_ID,
3557
- ]
3558
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3559
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3560
- sort_exclude_columns.append(self._get_date_column(search_keys))
3561
- else:
3562
- date_column = self._get_date_column(search_keys)
3563
- sort_exclude_columns.append(date_column)
3564
- columns_to_sort = [date_column] if date_column is not None else []
3575
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3576
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3565
3577
 
3566
- do_sorting = True
3567
- if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
3568
- # Check duplicates by date and id_columns
3569
- duplicate_check_columns = [c for c in self.id_columns if c in df.columns]
3570
- if date_column is not None:
3571
- duplicate_check_columns.append(date_column)
3578
+ other_columns = sorted(
3579
+ [
3580
+ c
3581
+ for c in df.columns
3582
+ if c not in sort_columns
3583
+ and c not in sorted_other_keys
3584
+ and c not in sort_exclude_columns
3585
+ and df[c].nunique() > 1
3586
+ ]
3587
+ )
3572
3588
 
3573
- duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3574
- if duplicates.any():
3575
- self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3576
- do_sorting = False
3577
- else:
3578
- columns_to_hash = list(search_keys.keys()) + self.id_columns
3579
- columns_to_hash = sort_columns(
3580
- df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
3581
- )
3582
- else:
3583
- columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
3589
+ all_other_columns = sorted_other_keys + other_columns
3584
3590
 
3585
- if do_sorting:
3586
3591
  search_keys_hash = "search_keys_hash"
3587
- if len(columns_to_hash) > 0:
3588
- df[search_keys_hash] = pd.util.hash_pandas_object(df[columns_to_hash], index=False)
3589
- columns_to_sort.append(search_keys_hash)
3592
+ if len(all_other_columns) > 0:
3593
+ sort_columns.append(search_keys_hash)
3594
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3590
3595
 
3591
- df = df.sort_values(by=columns_to_sort)
3596
+ df = df.sort_values(by=sort_columns)
3592
3597
 
3593
3598
  if search_keys_hash in df.columns:
3594
3599
  df.drop(columns=search_keys_hash, inplace=True)
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
35
35
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
36
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
37
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
- date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
39
38
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
40
39
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
40
  # Errors
@@ -112,6 +111,7 @@ x_is_empty=X is empty
112
111
  y_is_empty=y is empty
113
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
114
113
  missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
115
115
  x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
116
  train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
117
  eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
116
116
  else:
117
117
  df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
118
118
 
119
- del self.search_keys[self.email_column]
120
- if self.email_column in self.unnest_search_keys:
121
- self.unnest_search_keys.remove(self.email_column)
119
+ # del self.search_keys[self.email_column]
120
+ # if self.email_column in self.unnest_search_keys:
121
+ # self.unnest_search_keys.remove(self.email_column)
122
122
 
123
123
  one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
124
  df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
125
  self.columns_renaming[one_domain_name] = original_email_column
126
126
  self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
127
127
 
128
- if self.email_converted_to_hem:
129
- df = df.drop(columns=self.email_column)
130
- del self.columns_renaming[self.email_column]
128
+ # if self.email_converted_to_hem:
129
+ # df = df.drop(columns=self.email_column)
130
+ # del self.columns_renaming[self.email_column]
131
131
 
132
132
  return df
@@ -1 +0,0 @@
1
- __version__ = "1.2.58a1"
@@ -1,177 +0,0 @@
1
- import warnings
2
- from collections import namedtuple
3
-
4
- import numpy as np
5
- import numpy.ma as ma
6
- import scipy
7
- from joblib import Parallel, delayed
8
- from numpy import ndarray
9
- from psutil import cpu_count
10
-
11
- np.seterr(divide="ignore")
12
-
13
-
14
- warnings.simplefilter(action="ignore", category=RuntimeWarning)
15
-
16
-
17
- def _find_repeats(arr):
18
- # This function assumes it may clobber its input.
19
- if len(arr) == 0:
20
- return np.array(0, np.float64), np.array(0, np.intp)
21
-
22
- # XXX This cast was previously needed for the Fortran implementation,
23
- # should we ditch it?
24
- arr = np.asarray(arr, np.float64).ravel()
25
- arr.sort()
26
-
27
- # Taken from NumPy 1.9's np.unique.
28
- change = np.concatenate(([True], arr[1:] != arr[:-1]))
29
- unique = arr[change]
30
- change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
31
- freq = np.diff(change_idx)
32
- atleast2 = freq > 1
33
- return unique[atleast2], freq[atleast2]
34
-
35
-
36
- def find_repeats(arr):
37
- # Make sure we get a copy. ma.compressed promises a "new array", but can
38
- # actually return a reference.
39
- compr = np.asarray(ma.compressed(arr), dtype=np.float64)
40
- try:
41
- need_copy = np.may_share_memory(compr, arr)
42
- except AttributeError:
43
- # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
44
- # while in numpy 1.8.2 and above it just (correctly) returns False.
45
- need_copy = False
46
- if need_copy:
47
- compr = compr.copy()
48
- return _find_repeats(compr)
49
-
50
-
51
- def rankdata(data, axis=None, use_missing=False):
52
- def _rank1d(data, use_missing=False):
53
- n = data.count()
54
- rk = np.empty(data.size, dtype=float)
55
- idx = data.argsort()
56
- rk[idx[:n]] = np.arange(1, n + 1)
57
-
58
- if use_missing:
59
- rk[idx[n:]] = (n + 1) / 2.0
60
- else:
61
- rk[idx[n:]] = 0
62
-
63
- repeats = find_repeats(data.copy())
64
- for r in repeats[0]:
65
- condition = (data == r).filled(False)
66
- rk[condition] = rk[condition].mean()
67
- return rk
68
-
69
- data = ma.array(data, copy=False)
70
- if axis is None:
71
- if data.ndim > 1:
72
- return _rank1d(data.ravel(), use_missing).reshape(data.shape)
73
- else:
74
- return _rank1d(data, use_missing)
75
- else:
76
- return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
77
-
78
-
79
- def _chk_asarray(a, axis):
80
- # Always returns a masked array, raveled for axis=None
81
- a = ma.asanyarray(a)
82
- if axis is None:
83
- a = ma.ravel(a)
84
- outaxis = 0
85
- else:
86
- outaxis = axis
87
- return a, outaxis
88
-
89
-
90
- SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
91
-
92
-
93
- # Taken from scipy.mstats with following tweaks:
94
- # 1. parallel pairwise computation
95
- # 2. custom masking
96
- def spearmanr(
97
- x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
98
- ):
99
- if not use_ties:
100
- raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
101
-
102
- # Always returns a masked array, raveled if axis=None
103
- x, axisout = _chk_asarray(x, axis)
104
- if y is not None:
105
- # Deal only with 2-D `x` case.
106
- y, _ = _chk_asarray(y, axis)
107
- if axisout == 0:
108
- x = ma.column_stack((x, y))
109
- else:
110
- x = ma.row_stack((x, y))
111
-
112
- if axisout == 1:
113
- # To simplify the code that follow (always use `n_obs, n_vars` shape)
114
- x = x.T
115
-
116
- if nan_policy == "omit":
117
- x = mask_fn(x)
118
-
119
- def _spearmanr_2cols(x):
120
- # Mask the same observations for all variables, and then drop those
121
- # observations (can't leave them masked, rankdata is weird).
122
- x = ma.mask_rowcols(x, axis=0)
123
- x = x[~x.mask.any(axis=1), :]
124
-
125
- # If either column is entirely NaN or Inf
126
- if not np.any(x.data):
127
- return SpearmanrResult(np.nan, np.nan)
128
-
129
- m = ma.getmask(x)
130
- n_obs = x.shape[0]
131
- dof = n_obs - 2 - int(m.sum(axis=0)[0])
132
- if dof < 0:
133
- return SpearmanrResult(np.nan, np.nan)
134
-
135
- # Gets the ranks and rank differences
136
- x_ranked = rankdata(x, axis=0)
137
- rs = ma.corrcoef(x_ranked, rowvar=False).data
138
-
139
- # rs can have elements equal to 1, so avoid zero division warnings
140
- with np.errstate(divide="ignore"):
141
- # clip the small negative values possibly caused by rounding
142
- # errors before taking the square root
143
- t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
-
145
- t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
146
-
147
- # For backwards compatibility, return scalars when comparing 2 columns
148
- if rs.shape == (2, 2):
149
- return SpearmanrResult(rs[1, 0], prob[1, 0])
150
- else:
151
- return SpearmanrResult(rs, prob)
152
-
153
- # Need to do this per pair of variables, otherwise the dropped observations
154
- # in a third column mess up the result for a pair.
155
- n_vars = x.shape[1]
156
- if n_vars == 2:
157
- return _spearmanr_2cols(x)
158
- else:
159
- max_cpu_cores = cpu_count(logical=False)
160
- with np.errstate(divide="ignore"):
161
- results = Parallel(n_jobs=max_cpu_cores)(
162
- delayed(_spearmanr_2cols)(x[:, [var1, var2]])
163
- for var1 in range(n_vars - 1)
164
- for var2 in range(var1 + 1, n_vars)
165
- )
166
-
167
- rs = np.ones((n_vars, n_vars), dtype=float)
168
- prob = np.zeros((n_vars, n_vars), dtype=float)
169
- for var1 in range(n_vars - 1):
170
- for var2 in range(var1 + 1, n_vars):
171
- result = results.pop(0)
172
- rs[var1, var2] = result.correlation
173
- rs[var2, var1] = result.correlation
174
- prob[var1, var2] = result.pvalue
175
- prob[var2, var1] = result.pvalue
176
-
177
- return SpearmanrResult(rs, prob)
@@ -1,160 +0,0 @@
1
- import hashlib
2
- from typing import Any, Dict, List
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from joblib import Parallel, delayed
7
- from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
8
- from psutil import cpu_count
9
- from scipy.stats import skew, spearmanr
10
-
11
- from upgini.metadata import ModelTaskType, SearchKey
12
- from upgini.utils import mstats
13
-
14
-
15
def sort_columns(
    df: pd.DataFrame,
    target_column: str,
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: List[str],
) -> List[str]:
    """Return the column names in the order they should be used for sorting.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey`` value. They are followed by the remaining non-constant
    columns, ranked in descending order of the value that
    ``get_sort_columns_dict`` assigns to each of them. Columns named in
    ``exclude_columns`` never appear in the result.
    """
    frame = df.copy()  # work on a copy so the caller's frame is never touched

    key_columns = []
    for key in sorted(search_keys.keys(), key=lambda name: str(search_keys.get(name))):
        if key not in exclude_columns:
            key_columns.append(key)

    feature_columns = []
    for column in frame.columns:
        if column in key_columns or column in exclude_columns:
            continue
        # Columns with a single distinct value are left out of the ranking.
        if frame[column].nunique() > 1:
            feature_columns.append(column)
    feature_columns.sort()

    prepared_target = prepare_target(frame[target_column], model_task_type)
    ranking = get_sort_columns_dict(
        frame[key_columns + feature_columns], prepared_target, key_columns, omit_nan=True
    )

    # Only features that actually received a ranking value are kept.
    ranked_features = sorted(
        (column for column in feature_columns if column in ranking),
        key=lambda column: ranking[column],
        reverse=True,
    )
    return key_columns + ranked_features
41
-
42
-
43
- def get_sort_columns_dict(
44
- df: pd.DataFrame,
45
- target: pd.Series,
46
- sorted_keys: List[str],
47
- omit_nan: bool,
48
- n_jobs: int | None = None,
49
- ) -> dict[str, Any]:
50
- string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
51
- columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
52
- if len(string_features) > 0:
53
- if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
54
- # factorize string features
55
- for c in string_features:
56
- df[c] = df[c].factorize(sort=True)[0]
57
- columns_for_sort.extend(string_features)
58
-
59
- if len(columns_for_sort) == 0:
60
- return {}
61
-
62
- df = df[columns_for_sort]
63
- hashes = [hash_series(df[col]) for col in columns_for_sort]
64
- df = np.asarray(df, dtype=np.float32)
65
- correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
66
-
67
- sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
68
- return sort_dict
69
-
70
-
71
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
    """Return, for each column of ``df``, its strongest correlation with ``target``.

    ``get_target_correlations`` yields one row per correlation method
    (truncated to 7 decimals here); the column-wise maximum of those rows is
    returned.
    """
    per_method = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    return per_method.max(axis=0)
75
-
76
-
77
- def get_target_correlations(
78
- df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
79
- ):
80
- df = np.asarray(df, dtype=np.float32)
81
- target_correlations = np.zeros((2, df.shape[1]))
82
- target_correlations[0, :] = np.nan_to_num(
83
- calculate_spearman_corr_with_target(df, target, omit_nan, n_jobs), copy=False
84
- )
85
- target_correlations[1, :] = np.nan_to_num(np.abs(np.corrcoef(df.T, target.T, rowvar=True)[-1, :-1]))
86
-
87
- target_correlations = np.trunc(target_correlations * 10**precision) / (10**precision)
88
-
89
- return target_correlations
90
-
91
-
92
def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
    """Keep only the first element (the correlation) of every ranking tuple."""
    correlations = {}
    for column, entry in sort_dict.items():
        correlations[column] = entry[0]
    return correlations
94
-
95
-
96
- def calculate_spearman_corr_with_target(
97
- X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
98
- ) -> np.ndarray:
99
- if isinstance(X, pd.DataFrame):
100
- X = np.asarray(X, dtype=np.float32)
101
-
102
- if X.size == 0:
103
- return np.ndarray(shape=(0,))
104
-
105
- all_correlations = np.zeros(X.shape[1])
106
- all_correlations.fill(np.nan)
107
- cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
108
-
109
- if omit_nan:
110
- results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
111
- delayed(mstats.spearmanr)(
112
- X[:, i],
113
- y,
114
- nan_policy="omit",
115
- axis=0,
116
- )
117
- for i in cols2calc
118
- )
119
- target_correlations = np.array([abs(res.correlation) for res in results])
120
- else:
121
- cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
122
- target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
123
- if isinstance(target_correlations, float):
124
- target_correlations = np.abs([target_correlations])
125
- else:
126
- target_correlations = np.abs(target_correlations)[-1, :-1]
127
-
128
- all_correlations[cols2calc] = target_correlations
129
-
130
- return all_correlations
131
-
132
-
133
- def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
134
- features_num = X.shape[1]
135
- if y is not None:
136
- features_num += 1
137
-
138
- if features_num < 2:
139
- return 1.0
140
- else:
141
- return spearmanr(X, y, nan_policy=nan_policy).correlation
142
-
143
-
144
def hash_series(series: pd.Series) -> int:
    """Return the SHA-256 of the series' per-row pandas hashes as an integer.

    The row hashes include the index, so two series with equal values but
    different indexes hash differently.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    # Big-endian interpretation of the digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big")
146
-
147
-
148
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Turn the target into a numeric series, keeping its original name.

    For any task other than regression, or when the target is neither numeric
    nor datetime, the values are cast to strings and replaced by their
    category codes. A regression target is left unchanged unless it is
    non-negative and its absolute skewness (rounded to 2 decimals) is at
    least 0.9, in which case ``log1p`` is applied.
    """
    name = target.name

    encode_as_categories = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )

    if encode_as_categories:
        prepared = target.astype(str).astype("category").cat.codes
    elif model_task_type == ModelTaskType.REGRESSION:
        prepared = target
        skewness = round(abs(skew(target)), 2)
        if target.min() >= 0 and skewness >= 0.9:
            prepared = np.log1p(target)
    else:
        prepared = target

    return pd.Series(prepared, name=name)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes