upgini 1.2.134__tar.gz → 1.2.135__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.134 → upgini-1.2.135}/PKG-INFO +1 -1
- upgini-1.2.135/src/upgini/__about__.py +1 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/dataset.py +4 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/features_enricher.py +60 -30
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/metadata.py +1 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/datetime_utils.py +2 -3
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/features_validator.py +5 -3
- upgini-1.2.134/src/upgini/__about__.py +0 -1
- {upgini-1.2.134 → upgini-1.2.135}/.gitignore +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/LICENSE +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/README.md +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/pyproject.toml +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/ads.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/errors.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/http.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/metrics.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/search_task.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/spinner.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/config.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/hash_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/psi.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.134 → upgini-1.2.135}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.135"
|
|
@@ -71,6 +71,7 @@ class Dataset:
|
|
|
71
71
|
date_column: Optional[str] = None,
|
|
72
72
|
id_columns: Optional[List[str]] = None,
|
|
73
73
|
is_imbalanced: bool = False,
|
|
74
|
+
dropped_columns: Optional[List[str]] = None,
|
|
74
75
|
random_state: Optional[int] = None,
|
|
75
76
|
sample_config: Optional[SampleConfig] = None,
|
|
76
77
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -118,6 +119,7 @@ class Dataset:
|
|
|
118
119
|
self.is_imbalanced: bool = False
|
|
119
120
|
self.id_columns = id_columns
|
|
120
121
|
self.is_imbalanced = is_imbalanced
|
|
122
|
+
self.dropped_columns = dropped_columns
|
|
121
123
|
self.date_column = date_column
|
|
122
124
|
if logger is not None:
|
|
123
125
|
self.logger = logger
|
|
@@ -285,6 +287,7 @@ class Dataset:
|
|
|
285
287
|
for key in search_group
|
|
286
288
|
if key in self.columns_renaming
|
|
287
289
|
and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
290
|
+
and not self.columns_renaming.get(key) == "current_date"
|
|
288
291
|
}
|
|
289
292
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
290
293
|
if (
|
|
@@ -475,6 +478,7 @@ class Dataset:
|
|
|
475
478
|
hierarchicalGroupKeys=self.hierarchical_group_keys,
|
|
476
479
|
hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
|
|
477
480
|
taskType=self.task_type,
|
|
481
|
+
droppedColumns=self.dropped_columns,
|
|
478
482
|
)
|
|
479
483
|
|
|
480
484
|
@staticmethod
|
|
@@ -751,7 +751,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
751
751
|
exclude_features_sources: list[str] | None = None,
|
|
752
752
|
keep_input: bool = True,
|
|
753
753
|
trace_id: str | None = None,
|
|
754
|
-
metrics_calculation: bool = False,
|
|
755
754
|
silent_mode=False,
|
|
756
755
|
progress_bar: ProgressBar | None = None,
|
|
757
756
|
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
|
@@ -810,11 +809,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
810
809
|
X,
|
|
811
810
|
y=y,
|
|
812
811
|
exclude_features_sources=exclude_features_sources,
|
|
813
|
-
metrics_calculation=metrics_calculation,
|
|
814
812
|
silent_mode=silent_mode,
|
|
815
813
|
progress_bar=progress_bar,
|
|
816
814
|
keep_input=keep_input,
|
|
817
815
|
)
|
|
816
|
+
if TARGET in result.columns:
|
|
817
|
+
result = result.drop(columns=TARGET)
|
|
818
818
|
self.logger.info("Transform finished successfully")
|
|
819
819
|
search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
|
|
820
820
|
if progress_bar is not None:
|
|
@@ -1047,7 +1047,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1047
1047
|
with Spinner():
|
|
1048
1048
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
1049
1049
|
|
|
1050
|
-
|
|
1050
|
+
date_col = self._get_date_column(search_keys)
|
|
1051
|
+
has_date = date_col is not None and date_col in validated_X.columns
|
|
1051
1052
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
1052
1053
|
cat_features = list(set(client_cat_features + cat_features_from_backend))
|
|
1053
1054
|
has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
|
|
@@ -1323,7 +1324,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1323
1324
|
search_keys = {str(k): v for k, v in search_keys.items()}
|
|
1324
1325
|
|
|
1325
1326
|
date_column = self._get_date_column(search_keys)
|
|
1326
|
-
has_date = date_column is not None
|
|
1327
|
+
has_date = date_column is not None and date_column in validated_X.columns
|
|
1327
1328
|
if not has_date:
|
|
1328
1329
|
self.logger.info("No date column for OOT PSI calculation")
|
|
1329
1330
|
return
|
|
@@ -1637,7 +1638,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1637
1638
|
|
|
1638
1639
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1639
1640
|
date_column = self._get_date_column(search_keys)
|
|
1640
|
-
date_series = X[date_column] if date_column is not None else None
|
|
1641
|
+
date_series = X[date_column] if date_column is not None and date_column in X.columns else None
|
|
1641
1642
|
_cv, groups = CVConfig(
|
|
1642
1643
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
1643
1644
|
).get_cv_and_groups(X)
|
|
@@ -1736,17 +1737,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1736
1737
|
|
|
1737
1738
|
self.logger.info(f"Excluding search keys: {excluding_search_keys}")
|
|
1738
1739
|
|
|
1740
|
+
file_meta = self._search_task.get_file_metadata(trace_id)
|
|
1741
|
+
fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
|
|
1742
|
+
original_dropped_features = [columns_renaming.get(f, f) for f in fit_dropped_features]
|
|
1743
|
+
|
|
1739
1744
|
client_features = [
|
|
1740
1745
|
c
|
|
1741
|
-
for c in
|
|
1746
|
+
for c in validated_X.columns.to_list()
|
|
1742
1747
|
if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
|
|
1743
1748
|
and c
|
|
1744
1749
|
not in (
|
|
1745
1750
|
excluding_search_keys
|
|
1746
|
-
+
|
|
1751
|
+
+ original_dropped_features
|
|
1747
1752
|
+ [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
1748
1753
|
)
|
|
1749
1754
|
]
|
|
1755
|
+
client_features.extend(f for f in generated_features if f in self.feature_names_)
|
|
1750
1756
|
if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
|
|
1751
1757
|
client_features.append(self.baseline_score_column)
|
|
1752
1758
|
self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
|
|
@@ -1847,7 +1853,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1847
1853
|
enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
|
|
1848
1854
|
enriched_eval_X, eval_y_sampled, self.cv
|
|
1849
1855
|
)
|
|
1850
|
-
if date_column is not None:
|
|
1856
|
+
if date_column is not None and date_column in eval_X_sorted.columns:
|
|
1851
1857
|
eval_set_dates[idx] = eval_X_sorted[date_column]
|
|
1852
1858
|
fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
|
|
1853
1859
|
fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
|
|
@@ -1936,7 +1942,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1936
1942
|
and self.df_with_original_index is not None
|
|
1937
1943
|
):
|
|
1938
1944
|
self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
|
|
1939
|
-
return self.__get_enriched_from_fit(
|
|
1945
|
+
return self.__get_enriched_from_fit(
|
|
1946
|
+
validated_X, validated_y, eval_set, trace_id, remove_outliers_calc_metrics
|
|
1947
|
+
)
|
|
1940
1948
|
else:
|
|
1941
1949
|
self.logger.info(
|
|
1942
1950
|
"Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
|
|
@@ -2074,6 +2082,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2074
2082
|
|
|
2075
2083
|
def __get_enriched_from_fit(
|
|
2076
2084
|
self,
|
|
2085
|
+
validated_X: pd.DataFrame,
|
|
2086
|
+
validated_y: pd.Series,
|
|
2077
2087
|
eval_set: list[tuple] | None,
|
|
2078
2088
|
trace_id: str,
|
|
2079
2089
|
remove_outliers_calc_metrics: bool | None,
|
|
@@ -2082,7 +2092,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2082
2092
|
search_keys = self.fit_search_keys.copy()
|
|
2083
2093
|
|
|
2084
2094
|
rows_to_drop = None
|
|
2085
|
-
|
|
2095
|
+
date_column = self._get_date_column(search_keys)
|
|
2096
|
+
has_date = date_column is not None and date_column in validated_X.columns
|
|
2086
2097
|
self.model_task_type = self.model_task_type or define_task(
|
|
2087
2098
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
2088
2099
|
)
|
|
@@ -2124,6 +2135,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2124
2135
|
drop_system_record_id=False,
|
|
2125
2136
|
)
|
|
2126
2137
|
|
|
2138
|
+
enriched_Xy.rename(columns=self.fit_columns_renaming, inplace=True)
|
|
2139
|
+
search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
|
|
2140
|
+
generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
|
|
2141
|
+
|
|
2142
|
+
validated_Xy = validated_X.copy()
|
|
2143
|
+
validated_Xy[TARGET] = validated_y
|
|
2144
|
+
|
|
2145
|
+
selecting_columns = self._selecting_input_and_generated_columns(
|
|
2146
|
+
validated_Xy, self.fit_generated_features, keep_input=True, trace_id=trace_id
|
|
2147
|
+
)
|
|
2148
|
+
selecting_columns.extend(
|
|
2149
|
+
c
|
|
2150
|
+
for c in enriched_Xy.columns
|
|
2151
|
+
if (c in self.feature_names_ and c not in selecting_columns and c not in validated_X.columns)
|
|
2152
|
+
or c in [EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SYSTEM_RECORD_ID]
|
|
2153
|
+
)
|
|
2154
|
+
enriched_Xy = enriched_Xy[selecting_columns]
|
|
2155
|
+
|
|
2127
2156
|
# Handle eval sets extraction based on EVAL_SET_INDEX
|
|
2128
2157
|
if EVAL_SET_INDEX in enriched_Xy.columns:
|
|
2129
2158
|
eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
|
|
@@ -2135,7 +2164,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2135
2164
|
].copy()
|
|
2136
2165
|
enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
|
|
2137
2166
|
|
|
2138
|
-
x_columns = [
|
|
2167
|
+
x_columns = [
|
|
2168
|
+
c
|
|
2169
|
+
for c in [self.fit_columns_renaming.get(k, k) for k in self.df_with_original_index.columns]
|
|
2170
|
+
if c not in [EVAL_SET_INDEX, TARGET] and c in selecting_columns
|
|
2171
|
+
]
|
|
2139
2172
|
X_sampled = enriched_Xy[x_columns].copy()
|
|
2140
2173
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
2141
2174
|
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -2157,15 +2190,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2157
2190
|
enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
|
|
2158
2191
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
2159
2192
|
|
|
2160
|
-
# reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
|
|
2161
|
-
X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
|
|
2162
|
-
enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
|
|
2163
|
-
for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
|
|
2164
|
-
eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
|
|
2165
|
-
enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
|
|
2166
|
-
search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
|
|
2167
|
-
generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
|
|
2168
|
-
|
|
2169
2193
|
datasets_hash = hash_input(self.X, self.y, self.eval_set)
|
|
2170
2194
|
return self.__cache_and_return_results(
|
|
2171
2195
|
datasets_hash,
|
|
@@ -2642,7 +2666,7 @@ if response.status_code == 200:
|
|
|
2642
2666
|
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
|
2643
2667
|
search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
|
|
2644
2668
|
selecting_columns = self._selecting_input_and_generated_columns(
|
|
2645
|
-
validated_Xy, generated_features, keep_input, trace_id
|
|
2669
|
+
validated_Xy, generated_features, keep_input, trace_id, is_transform=True
|
|
2646
2670
|
)
|
|
2647
2671
|
self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
|
|
2648
2672
|
if add_fit_system_record_id:
|
|
@@ -2895,7 +2919,7 @@ if response.status_code == 200:
|
|
|
2895
2919
|
)
|
|
2896
2920
|
|
|
2897
2921
|
selecting_columns = self._selecting_input_and_generated_columns(
|
|
2898
|
-
validated_Xy, generated_features, keep_input, trace_id
|
|
2922
|
+
validated_Xy, generated_features, keep_input, trace_id, is_transform=True
|
|
2899
2923
|
)
|
|
2900
2924
|
selecting_columns.extend(
|
|
2901
2925
|
c
|
|
@@ -2933,20 +2957,24 @@ if response.status_code == 200:
|
|
|
2933
2957
|
generated_features: list[str],
|
|
2934
2958
|
keep_input: bool,
|
|
2935
2959
|
trace_id: str,
|
|
2960
|
+
is_transform: bool = False,
|
|
2936
2961
|
):
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
2941
|
-
|
|
2962
|
+
file_meta = self._search_task.get_file_metadata(trace_id)
|
|
2963
|
+
fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
|
|
2964
|
+
fit_input_columns = [c.originalName for c in file_meta.columns]
|
|
2965
|
+
original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in fit_dropped_features]
|
|
2966
|
+
new_columns_on_transform = [
|
|
2967
|
+
c for c in validated_Xy.columns if c not in fit_input_columns and c not in original_dropped_features
|
|
2942
2968
|
]
|
|
2969
|
+
|
|
2970
|
+
selected_generated_features = [c for c in generated_features if c in self.feature_names_]
|
|
2943
2971
|
if keep_input is True:
|
|
2944
2972
|
selected_input_columns = [
|
|
2945
2973
|
c
|
|
2946
2974
|
for c in validated_Xy.columns
|
|
2947
2975
|
if not self.fit_select_features
|
|
2948
2976
|
or c in self.feature_names_
|
|
2949
|
-
or c in new_columns_on_transform
|
|
2977
|
+
or (c in new_columns_on_transform and is_transform)
|
|
2950
2978
|
or c in self.search_keys
|
|
2951
2979
|
or c in (self.id_columns or [])
|
|
2952
2980
|
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
|
@@ -3112,7 +3140,7 @@ if response.status_code == 200:
|
|
|
3112
3140
|
self.fit_search_keys = self.__prepare_search_keys(df, self.fit_search_keys, is_demo_dataset)
|
|
3113
3141
|
|
|
3114
3142
|
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3115
|
-
has_date = maybe_date_column is not None
|
|
3143
|
+
has_date = maybe_date_column is not None and maybe_date_column in validated_X.columns
|
|
3116
3144
|
|
|
3117
3145
|
self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
3118
3146
|
|
|
@@ -3358,6 +3386,7 @@ if response.status_code == 200:
|
|
|
3358
3386
|
cv_type=self.cv,
|
|
3359
3387
|
id_columns=self.__get_renamed_id_columns(),
|
|
3360
3388
|
is_imbalanced=self.imbalanced,
|
|
3389
|
+
dropped_columns=[self.fit_columns_renaming.get(f, f) for f in self.fit_dropped_features],
|
|
3361
3390
|
date_column=self._get_date_column(self.fit_search_keys),
|
|
3362
3391
|
date_format=self.date_format,
|
|
3363
3392
|
random_state=self.random_state,
|
|
@@ -3746,7 +3775,8 @@ if response.status_code == 200:
|
|
|
3746
3775
|
if eval_set is None:
|
|
3747
3776
|
return None
|
|
3748
3777
|
validated_eval_set = []
|
|
3749
|
-
|
|
3778
|
+
date_col = self._get_date_column(self.search_keys)
|
|
3779
|
+
has_date = date_col is not None and date_col in X.columns
|
|
3750
3780
|
for idx, eval_pair in enumerate(eval_set):
|
|
3751
3781
|
validated_pair = self._validate_eval_set_pair(X, eval_pair)
|
|
3752
3782
|
if validated_pair[1].isna().all():
|
|
@@ -252,6 +252,7 @@ class FileMetadata(BaseModel):
|
|
|
252
252
|
rowsCount: Optional[int] = None
|
|
253
253
|
checksumMD5: Optional[str] = None
|
|
254
254
|
digest: Optional[str] = None
|
|
255
|
+
droppedColumns: Optional[List[str]] = None
|
|
255
256
|
|
|
256
257
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
257
258
|
for c in self.columns:
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
|
-
import re
|
|
4
3
|
from typing import Dict, List, Optional
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
@@ -67,7 +66,7 @@ class DateTimeConverter:
|
|
|
67
66
|
try:
|
|
68
67
|
if s is None or len(str(s).strip()) == 0:
|
|
69
68
|
return None
|
|
70
|
-
if
|
|
69
|
+
if sum(ch.isdigit() for ch in str(s)) < 6:
|
|
71
70
|
return None
|
|
72
71
|
return s
|
|
73
72
|
except Exception:
|
|
@@ -116,7 +115,7 @@ class DateTimeConverter:
|
|
|
116
115
|
else:
|
|
117
116
|
return None
|
|
118
117
|
else:
|
|
119
|
-
date_col = date_col.astype("string")
|
|
118
|
+
date_col = date_col.astype("string").apply(self.clean_date)
|
|
120
119
|
parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
|
|
121
120
|
if parsed_datetime.isna().all():
|
|
122
121
|
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
@@ -44,12 +44,14 @@ class FeaturesValidator:
|
|
|
44
44
|
else:
|
|
45
45
|
empty_or_constant_features.append(f)
|
|
46
46
|
|
|
47
|
+
columns_renaming = columns_renaming or {}
|
|
48
|
+
|
|
47
49
|
if one_hot_encoded_features:
|
|
48
|
-
msg = bundle.get("one_hot_encoded_features").format(
|
|
50
|
+
msg = bundle.get("one_hot_encoded_features").format(
|
|
51
|
+
[columns_renaming.get(f, f) for f in one_hot_encoded_features]
|
|
52
|
+
)
|
|
49
53
|
warnings.append(msg)
|
|
50
54
|
|
|
51
|
-
columns_renaming = columns_renaming or {}
|
|
52
|
-
|
|
53
55
|
if empty_or_constant_features:
|
|
54
56
|
msg = bundle.get("empty_or_contant_features").format(
|
|
55
57
|
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.134"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|