upgini-1.1.152-py3-none-any.whl → upgini-1.1.154a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/features_enricher.py +60 -50
- {upgini-1.1.152.dist-info → upgini-1.1.154a1.dist-info}/METADATA +1 -1
- {upgini-1.1.152.dist-info → upgini-1.1.154a1.dist-info}/RECORD +6 -6
- {upgini-1.1.152.dist-info → upgini-1.1.154a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.152.dist-info → upgini-1.1.154a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.152.dist-info → upgini-1.1.154a1.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -142,6 +142,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
142
142
|
round_embeddings: Optional[int] = None,
|
|
143
143
|
logs_enabled: bool = True,
|
|
144
144
|
raise_validation_error: bool = False,
|
|
145
|
+
exclude_columns: Optional[List[str]] = None,
|
|
145
146
|
**kwargs,
|
|
146
147
|
):
|
|
147
148
|
self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
|
|
@@ -207,21 +208,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
207
208
|
self.shared_datasets = shared_datasets
|
|
208
209
|
if shared_datasets is not None:
|
|
209
210
|
self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
|
|
210
|
-
self.generate_features
|
|
211
|
+
self.generate_features = generate_features
|
|
212
|
+
self.round_embeddings = round_embeddings
|
|
211
213
|
if generate_features is not None:
|
|
212
214
|
if len(generate_features) > 2:
|
|
213
215
|
msg = bundle.get("too_many_generate_features")
|
|
214
216
|
self.logger.error(msg)
|
|
215
217
|
raise ValidationError(msg)
|
|
216
|
-
self.generate_features = generate_features
|
|
217
218
|
self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
|
|
218
|
-
self.round_embeddings: Optional[int] = None
|
|
219
219
|
if round_embeddings is not None:
|
|
220
220
|
if not isinstance(round_embeddings, int) or round_embeddings < 0:
|
|
221
221
|
msg = bundle.get("invalid_round_embeddings")
|
|
222
222
|
self.logger.error(msg)
|
|
223
223
|
raise ValidationError(msg)
|
|
224
|
-
self.round_embeddings = round_embeddings
|
|
225
224
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
226
225
|
|
|
227
226
|
self.passed_features: List[str] = []
|
|
@@ -238,6 +237,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
238
237
|
self.imbalanced = False
|
|
239
238
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
240
239
|
self.raise_validation_error = raise_validation_error
|
|
240
|
+
self.exclude_columns = exclude_columns
|
|
241
241
|
|
|
242
242
|
def _get_api_key(self):
|
|
243
243
|
return self._api_key
|
|
@@ -678,11 +678,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
678
678
|
|
|
679
679
|
try:
|
|
680
680
|
self.__log_debug_information(
|
|
681
|
-
X,
|
|
682
|
-
y,
|
|
683
|
-
eval_set,
|
|
681
|
+
X if X is not None else self.X,
|
|
682
|
+
y if y is not None else self.y,
|
|
683
|
+
eval_set if eval_set is not None else self.eval_set,
|
|
684
684
|
exclude_features_sources=exclude_features_sources,
|
|
685
|
-
cv=cv,
|
|
685
|
+
cv=cv if cv is not None else self.cv,
|
|
686
686
|
importance_threshold=importance_threshold,
|
|
687
687
|
max_features=max_features,
|
|
688
688
|
scoring=scoring,
|
|
@@ -1801,6 +1801,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1801
1801
|
if not is_transform and not validated_X.index.is_unique:
|
|
1802
1802
|
raise ValidationError(bundle.get("x_non_unique_index"))
|
|
1803
1803
|
|
|
1804
|
+
if self.exclude_columns is not None:
|
|
1805
|
+
validated_X = drop_existing_columns(validated_X, self.exclude_columns)
|
|
1806
|
+
|
|
1804
1807
|
if TARGET in validated_X.columns:
|
|
1805
1808
|
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(TARGET))
|
|
1806
1809
|
if not is_transform and EVAL_SET_INDEX in validated_X.columns:
|
|
@@ -1970,51 +1973,58 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1970
1973
|
estimator: Optional[Any] = None,
|
|
1971
1974
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
1972
1975
|
):
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
|
|
1978
|
-
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1976
|
+
try:
|
|
1977
|
+
resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
|
|
1978
|
+
self.logger.info(
|
|
1979
|
+
f"Search keys: {self.search_keys}\n"
|
|
1980
|
+
f"Country code: {self.country_code}\n"
|
|
1981
|
+
f"Model task type: {self.model_task_type}\n"
|
|
1982
|
+
f"Api key presented?: {resolved_api_key is not None and resolved_api_key != ''}\n"
|
|
1983
|
+
f"Endpoint: {self.endpoint}\n"
|
|
1984
|
+
f"Runtime parameters: {self.runtime_parameters}\n"
|
|
1985
|
+
f"Date format: {self.date_format}\n"
|
|
1986
|
+
f"CV: {cv}\n"
|
|
1987
|
+
f"importance_threshold: {importance_threshold}\n"
|
|
1988
|
+
f"max_features: {max_features}\n"
|
|
1989
|
+
f"Shared datasets: {self.shared_datasets}\n"
|
|
1990
|
+
f"Random state: {self.random_state}\n"
|
|
1991
|
+
f"Generate features: {self.generate_features}\n"
|
|
1992
|
+
f"Round embeddings: {self.round_embeddings}\n"
|
|
1993
|
+
f"Detect missing search keys: {self.detect_missing_search_keys}\n"
|
|
1994
|
+
f"Exclude features sources: {exclude_features_sources}\n"
|
|
1995
|
+
f"Calculate metrics: {calculate_metrics}\n"
|
|
1996
|
+
f"Scoring: {scoring}\n"
|
|
1997
|
+
f"Estimator: {estimator}\n"
|
|
1998
|
+
f"Remove target outliers: {remove_outliers_calc_metrics}\n"
|
|
1999
|
+
f"Exclude columns: {self.exclude_columns}\n"
|
|
2000
|
+
f"Search id: {self.search_id}\n"
|
|
2001
|
+
)
|
|
1997
2002
|
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
+
def sample(df):
|
|
2004
|
+
if isinstance(df, pd.Series) or isinstance(df, pd.DataFrame):
|
|
2005
|
+
return df.head(10)
|
|
2006
|
+
else:
|
|
2007
|
+
return df[:10]
|
|
2003
2008
|
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2009
|
+
def print_datasets_sample():
|
|
2010
|
+
if X is not None:
|
|
2011
|
+
self.logger.info(f"First 10 rows of the X with shape {X.shape}:\n{sample(X)}")
|
|
2012
|
+
if y is not None:
|
|
2013
|
+
self.logger.info(f"First 10 rows of the y with shape {_num_samples(y)}:\n{sample(y)}")
|
|
2014
|
+
if eval_set is not None:
|
|
2015
|
+
for idx, eval_pair in enumerate(eval_set):
|
|
2016
|
+
eval_X: pd.DataFrame = eval_pair[0]
|
|
2017
|
+
eval_y = eval_pair[1]
|
|
2018
|
+
self.logger.info(
|
|
2019
|
+
f"First 10 rows of the eval_X_{idx} with shape {eval_X.shape}:\n{sample(eval_X)}"
|
|
2020
|
+
)
|
|
2021
|
+
self.logger.info(
|
|
2022
|
+
f"First 10 rows of the eval_y_{idx} with shape {_num_samples(eval_y)}:\n{sample(eval_y)}"
|
|
2023
|
+
)
|
|
2016
2024
|
|
|
2017
|
-
|
|
2025
|
+
do_without_pandas_limits(print_datasets_sample)
|
|
2026
|
+
except Exception:
|
|
2027
|
+
self.logger.exception("Failed to log debug information")
|
|
2018
2028
|
|
|
2019
2029
|
def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
|
|
2020
2030
|
index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
|
|
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=Jmteorv9nMn0bASMUixPXkTfBsONDuol0UIyPznOmRw,44111
|
|
4
4
|
upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=zWIZmROTY3Fi1PsivJhMrGBpznZ-K5mMZApFD6H0AXM,126119
|
|
6
6
|
upgini/fingerprint.js,sha256=wfzunoC87TdquCdABOwcrkoGOoJsX89ICTOb4rsrO50,34162
|
|
7
7
|
upgini/http.py,sha256=ke85Fb1ffD29tjgpnbHF_6gtV3nBJe4Xoxpp9i1GuLA,37176
|
|
8
8
|
upgini/metadata.py,sha256=Oefg-rkA4PsZUHIho_clZcnyZwdtVJ1gXPvEY6oBmpg,5969
|
|
@@ -42,8 +42,8 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
|
|
|
42
42
|
upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
|
|
43
43
|
upgini/utils/track_info.py,sha256=2IGGyHPXBLhWcLO8-Q-5qir52k_kD6DtdU-sv_Z2hHY,5325
|
|
44
44
|
upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
|
|
45
|
-
upgini-1.1.
|
|
46
|
-
upgini-1.1.
|
|
47
|
-
upgini-1.1.
|
|
48
|
-
upgini-1.1.
|
|
49
|
-
upgini-1.1.
|
|
45
|
+
upgini-1.1.154a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
46
|
+
upgini-1.1.154a1.dist-info/METADATA,sha256=fQ2EK-jmAweNVHlfc3eoRKYs3IEzPkDA6iO8mzByczs,47897
|
|
47
|
+
upgini-1.1.154a1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
|
48
|
+
upgini-1.1.154a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
49
|
+
upgini-1.1.154a1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|