upgini 1.2.134__tar.gz → 1.2.135a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (82)
  1. {upgini-1.2.134 → upgini-1.2.135a1}/PKG-INFO +1 -1
  2. upgini-1.2.135a1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/features_enricher.py +39 -22
  4. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/datetime_utils.py +2 -3
  5. upgini-1.2.134/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.134 → upgini-1.2.135a1}/.gitignore +0 -0
  7. {upgini-1.2.134 → upgini-1.2.135a1}/LICENSE +0 -0
  8. {upgini-1.2.134 → upgini-1.2.135a1}/README.md +0 -0
  9. {upgini-1.2.134 → upgini-1.2.135a1}/pyproject.toml +0 -0
  10. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/http.py +0 -0
  37. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/metrics.py +0 -0
  41. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/normalizer/__init__.py +0 -0
  42. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/normalizer/normalize_utils.py +0 -0
  43. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/resource_bundle/__init__.py +0 -0
  44. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  45. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/resource_bundle/strings.properties +0 -0
  46. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  62. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.134 → upgini-1.2.135a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.134
3
+ Version: 1.2.135a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.135a1"
@@ -751,7 +751,6 @@ class FeaturesEnricher(TransformerMixin):
751
751
  exclude_features_sources: list[str] | None = None,
752
752
  keep_input: bool = True,
753
753
  trace_id: str | None = None,
754
- metrics_calculation: bool = False,
755
754
  silent_mode=False,
756
755
  progress_bar: ProgressBar | None = None,
757
756
  progress_callback: Callable[[SearchProgress], Any] | None = None,
@@ -810,11 +809,12 @@ class FeaturesEnricher(TransformerMixin):
810
809
  X,
811
810
  y=y,
812
811
  exclude_features_sources=exclude_features_sources,
813
- metrics_calculation=metrics_calculation,
814
812
  silent_mode=silent_mode,
815
813
  progress_bar=progress_bar,
816
814
  keep_input=keep_input,
817
815
  )
816
+ if TARGET in result.columns:
817
+ result.drop(columns=TARGET, inplace=True)
818
818
  self.logger.info("Transform finished successfully")
819
819
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
820
820
  if progress_bar is not None:
@@ -1637,7 +1637,7 @@ class FeaturesEnricher(TransformerMixin):
1637
1637
 
1638
1638
  if not isinstance(_cv, BaseCrossValidator):
1639
1639
  date_column = self._get_date_column(search_keys)
1640
- date_series = X[date_column] if date_column is not None else None
1640
+ date_series = X[date_column] if date_column is not None and date_column in X.columns else None
1641
1641
  _cv, groups = CVConfig(
1642
1642
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
1643
1643
  ).get_cv_and_groups(X)
@@ -1738,7 +1738,7 @@ class FeaturesEnricher(TransformerMixin):
1738
1738
 
1739
1739
  client_features = [
1740
1740
  c
1741
- for c in (validated_X.columns.to_list() + generated_features)
1741
+ for c in validated_X.columns.to_list()
1742
1742
  if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
1743
1743
  and c
1744
1744
  not in (
@@ -1747,6 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
1747
1747
  + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1748
1748
  )
1749
1749
  ]
1750
+ client_features.extend(f for f in generated_features if f in self.feature_names_)
1750
1751
  if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
1751
1752
  client_features.append(self.baseline_score_column)
1752
1753
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
@@ -1847,7 +1848,7 @@ class FeaturesEnricher(TransformerMixin):
1847
1848
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
1848
1849
  enriched_eval_X, eval_y_sampled, self.cv
1849
1850
  )
1850
- if date_column is not None:
1851
+ if date_column is not None and date_column in eval_X_sorted.columns:
1851
1852
  eval_set_dates[idx] = eval_X_sorted[date_column]
1852
1853
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
1853
1854
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
@@ -1936,7 +1937,9 @@ class FeaturesEnricher(TransformerMixin):
1936
1937
  and self.df_with_original_index is not None
1937
1938
  ):
1938
1939
  self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
1939
- return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
1940
+ return self.__get_enriched_from_fit(
1941
+ validated_X, validated_y, eval_set, trace_id, remove_outliers_calc_metrics
1942
+ )
1940
1943
  else:
1941
1944
  self.logger.info(
1942
1945
  "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
@@ -2074,6 +2077,8 @@ class FeaturesEnricher(TransformerMixin):
2074
2077
 
2075
2078
  def __get_enriched_from_fit(
2076
2079
  self,
2080
+ validated_X: pd.DataFrame,
2081
+ validated_y: pd.Series,
2077
2082
  eval_set: list[tuple] | None,
2078
2083
  trace_id: str,
2079
2084
  remove_outliers_calc_metrics: bool | None,
@@ -2124,6 +2129,24 @@ class FeaturesEnricher(TransformerMixin):
2124
2129
  drop_system_record_id=False,
2125
2130
  )
2126
2131
 
2132
+ enriched_Xy.rename(columns=self.fit_columns_renaming, inplace=True)
2133
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
2134
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
2135
+
2136
+ validated_Xy = validated_X.copy()
2137
+ validated_Xy[TARGET] = validated_y
2138
+
2139
+ selecting_columns = self._selecting_input_and_generated_columns(
2140
+ validated_Xy, self.fit_generated_features, keep_input=True, trace_id=trace_id
2141
+ )
2142
+ selecting_columns.extend(
2143
+ c
2144
+ for c in enriched_Xy.columns
2145
+ if (c in self.feature_names_ and c not in selecting_columns and c not in validated_X.columns)
2146
+ or c in [EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SYSTEM_RECORD_ID]
2147
+ )
2148
+ enriched_Xy = enriched_Xy[selecting_columns]
2149
+
2127
2150
  # Handle eval sets extraction based on EVAL_SET_INDEX
2128
2151
  if EVAL_SET_INDEX in enriched_Xy.columns:
2129
2152
  eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
@@ -2135,7 +2158,11 @@ class FeaturesEnricher(TransformerMixin):
2135
2158
  ].copy()
2136
2159
  enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
2137
2160
 
2138
- x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
2161
+ x_columns = [
2162
+ c
2163
+ for c in [self.fit_columns_renaming.get(k, k) for k in self.df_with_original_index.columns]
2164
+ if c not in [EVAL_SET_INDEX, TARGET] and c in selecting_columns
2165
+ ]
2139
2166
  X_sampled = enriched_Xy[x_columns].copy()
2140
2167
  y_sampled = enriched_Xy[TARGET].copy()
2141
2168
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -2157,15 +2184,6 @@ class FeaturesEnricher(TransformerMixin):
2157
2184
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
2158
2185
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
2159
2186
 
2160
- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2161
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2162
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
2163
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
2164
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2165
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
2166
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
2167
- generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
2168
-
2169
2187
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
2170
2188
  return self.__cache_and_return_results(
2171
2189
  datasets_hash,
@@ -2642,7 +2660,7 @@ if response.status_code == 200:
2642
2660
  generated_features = [columns_renaming.get(c, c) for c in generated_features]
2643
2661
  search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
2644
2662
  selecting_columns = self._selecting_input_and_generated_columns(
2645
- validated_Xy, generated_features, keep_input, trace_id
2663
+ validated_Xy, generated_features, keep_input, trace_id, is_transform=True
2646
2664
  )
2647
2665
  self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
2648
2666
  if add_fit_system_record_id:
@@ -2895,7 +2913,7 @@ if response.status_code == 200:
2895
2913
  )
2896
2914
 
2897
2915
  selecting_columns = self._selecting_input_and_generated_columns(
2898
- validated_Xy, generated_features, keep_input, trace_id
2916
+ validated_Xy, generated_features, keep_input, trace_id, is_transform=True
2899
2917
  )
2900
2918
  selecting_columns.extend(
2901
2919
  c
@@ -2933,20 +2951,19 @@ if response.status_code == 200:
2933
2951
  generated_features: list[str],
2934
2952
  keep_input: bool,
2935
2953
  trace_id: str,
2954
+ is_transform: bool = False,
2936
2955
  ):
2937
2956
  fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2938
2957
  new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
2939
2958
 
2940
- selected_generated_features = [
2941
- c for c in generated_features if c in self.feature_names_
2942
- ]
2959
+ selected_generated_features = [c for c in generated_features if c in self.feature_names_]
2943
2960
  if keep_input is True:
2944
2961
  selected_input_columns = [
2945
2962
  c
2946
2963
  for c in validated_Xy.columns
2947
2964
  if not self.fit_select_features
2948
2965
  or c in self.feature_names_
2949
- or c in new_columns_on_transform
2966
+ or (c in new_columns_on_transform and is_transform)
2950
2967
  or c in self.search_keys
2951
2968
  or c in (self.id_columns or [])
2952
2969
  or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
@@ -1,6 +1,5 @@
1
1
  import datetime
2
2
  import logging
3
- import re
4
3
  from typing import Dict, List, Optional
5
4
 
6
5
  import numpy as np
@@ -67,7 +66,7 @@ class DateTimeConverter:
67
66
  try:
68
67
  if s is None or len(str(s).strip()) == 0:
69
68
  return None
70
- if not re.match(DATETIME_PATTERN, str(s)):
69
+ if sum(ch.isdigit() for ch in str(s)) < 6:
71
70
  return None
72
71
  return s
73
72
  except Exception:
@@ -116,7 +115,7 @@ class DateTimeConverter:
116
115
  else:
117
116
  return None
118
117
  else:
119
- date_col = date_col.astype("string") # .apply(self.clean_date)
118
+ date_col = date_col.astype("string").apply(self.clean_date)
120
119
  parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
121
120
  if parsed_datetime.isna().all():
122
121
  raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
@@ -1 +0,0 @@
1
- __version__ = "1.2.134"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes