upgini 1.1.280.dev0__tar.gz → 1.1.281__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.280.dev0 → upgini-1.1.281}/PKG-INFO +1 -1
- upgini-1.1.281/src/upgini/__about__.py +1 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/dataset.py +1 -1
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/features_enricher.py +16 -8
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/metrics.py +12 -2
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/display_utils.py +6 -4
- upgini-1.1.280.dev0/src/upgini/__about__.py +0 -1
- {upgini-1.1.280.dev0 → upgini-1.1.281}/.gitignore +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/LICENSE +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/README.md +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/pyproject.toml +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/errors.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/http.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/metadata.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/search_task.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/spinner.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.281"
|
|
@@ -246,7 +246,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
246
246
|
if len(columns_to_fix) > 0:
|
|
247
247
|
self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
|
|
248
248
|
for col in columns_to_fix:
|
|
249
|
-
self.data[col] = self.data[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
249
|
+
self.data[col] = self.data[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
250
250
|
|
|
251
251
|
@staticmethod
|
|
252
252
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
@@ -930,6 +930,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
930
930
|
scoring,
|
|
931
931
|
groups=groups,
|
|
932
932
|
text_features=self.generate_features,
|
|
933
|
+
has_date=has_date,
|
|
933
934
|
)
|
|
934
935
|
metric = wrapper.metric_name
|
|
935
936
|
multiplier = wrapper.multiplier
|
|
@@ -956,6 +957,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
956
957
|
add_params=custom_loss_add_params,
|
|
957
958
|
groups=groups,
|
|
958
959
|
text_features=self.generate_features,
|
|
960
|
+
has_date=has_date,
|
|
959
961
|
)
|
|
960
962
|
etalon_metric = baseline_estimator.cross_val_predict(
|
|
961
963
|
fitting_X, y_sorted, self.baseline_score_column
|
|
@@ -981,6 +983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
981
983
|
add_params=custom_loss_add_params,
|
|
982
984
|
groups=groups,
|
|
983
985
|
text_features=self.generate_features,
|
|
986
|
+
has_date=has_date,
|
|
984
987
|
)
|
|
985
988
|
enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
986
989
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
@@ -1333,8 +1336,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1333
1336
|
excluding_search_keys = list(search_keys.keys())
|
|
1334
1337
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1335
1338
|
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1336
|
-
meta = self._search_task.get_all_features_metadata_v2()
|
|
1337
|
-
zero_importance_client_features = [m for m in meta if m.source == "etalon" and m.shap_value == 0.0]
|
|
1338
1339
|
|
|
1339
1340
|
client_features = [
|
|
1340
1341
|
c
|
|
@@ -1344,7 +1345,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1344
1345
|
excluding_search_keys
|
|
1345
1346
|
+ list(self.fit_dropped_features)
|
|
1346
1347
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1347
|
-
+ zero_importance_client_features
|
|
1348
1348
|
)
|
|
1349
1349
|
]
|
|
1350
1350
|
|
|
@@ -1403,9 +1403,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1403
1403
|
if len(decimal_columns_to_fix) > 0:
|
|
1404
1404
|
self.logger.warning(f"Convert strings with decimal comma to float: {decimal_columns_to_fix}")
|
|
1405
1405
|
for col in decimal_columns_to_fix:
|
|
1406
|
-
fitting_X[col] = fitting_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1406
|
+
fitting_X[col] = fitting_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
1407
1407
|
fitting_enriched_X[col] = (
|
|
1408
|
-
fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1408
|
+
fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
1409
1409
|
)
|
|
1410
1410
|
|
|
1411
1411
|
fitting_eval_set_dict = dict()
|
|
@@ -1441,9 +1441,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1441
1441
|
# Correct string features with decimal commas
|
|
1442
1442
|
if len(decimal_columns_to_fix) > 0:
|
|
1443
1443
|
for col in decimal_columns_to_fix:
|
|
1444
|
-
fitting_eval_X[col] = fitting_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1444
|
+
fitting_eval_X[col] = (
|
|
1445
|
+
fitting_eval_X[col]
|
|
1446
|
+
.astype("string").str
|
|
1447
|
+
.replace(",", ".", regex=False)
|
|
1448
|
+
.astype(np.float64)
|
|
1449
|
+
)
|
|
1445
1450
|
fitting_enriched_eval_X[col] = (
|
|
1446
|
-
fitting_enriched_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1451
|
+
fitting_enriched_eval_X[col]
|
|
1452
|
+
.astype("string").str
|
|
1453
|
+
.replace(",", ".", regex=False)
|
|
1454
|
+
.astype(np.float64)
|
|
1447
1455
|
)
|
|
1448
1456
|
|
|
1449
1457
|
fitting_eval_set_dict[idx] = (
|
|
@@ -3712,7 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3712
3720
|
if y is not None:
|
|
3713
3721
|
with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
|
|
3714
3722
|
pickle.dump(sample(y, xy_sample_index), y_file)
|
|
3715
|
-
if eval_set:
|
|
3723
|
+
if eval_set and _num_samples(eval_set[0][0]) > 0:
|
|
3716
3724
|
eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
|
|
3717
3725
|
with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
|
|
3718
3726
|
pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
|
|
@@ -314,9 +314,17 @@ class EstimatorWrapper:
|
|
|
314
314
|
metrics_by_fold = cv_results["test_score"]
|
|
315
315
|
self.cv_estimators = cv_results["estimator"]
|
|
316
316
|
|
|
317
|
+
self.check_fold_metrics(metrics_by_fold)
|
|
318
|
+
|
|
317
319
|
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
318
320
|
return self.post_process_metric(metric)
|
|
319
321
|
|
|
322
|
+
def check_fold_metrics(self, metrics_by_fold: List[float]):
|
|
323
|
+
first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
|
|
324
|
+
for metric in metrics_by_fold[1:]:
|
|
325
|
+
if first_metric_sign * metric < 0:
|
|
326
|
+
self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
|
|
327
|
+
|
|
320
328
|
def post_process_metric(self, metric: float) -> float:
|
|
321
329
|
if self.metric_name == "GINI":
|
|
322
330
|
metric = 2 * metric - 1
|
|
@@ -346,6 +354,7 @@ class EstimatorWrapper:
|
|
|
346
354
|
text_features: Optional[List[str]] = None,
|
|
347
355
|
add_params: Optional[Dict[str, Any]] = None,
|
|
348
356
|
groups: Optional[List[str]] = None,
|
|
357
|
+
has_date: Optional[bool] = None,
|
|
349
358
|
) -> EstimatorWrapper:
|
|
350
359
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
351
360
|
kwargs = {
|
|
@@ -360,6 +369,7 @@ class EstimatorWrapper:
|
|
|
360
369
|
}
|
|
361
370
|
if estimator is None:
|
|
362
371
|
params = dict()
|
|
372
|
+
params["has_time"] = has_date
|
|
363
373
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
364
374
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
365
375
|
if target_type == ModelTaskType.MULTICLASS:
|
|
@@ -475,7 +485,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
475
485
|
|
|
476
486
|
# Find rest categorical features
|
|
477
487
|
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
478
|
-
x = fill_na_cat_features(x, self.cat_features)
|
|
488
|
+
# x = fill_na_cat_features(x, self.cat_features)
|
|
479
489
|
unique_cat_features = []
|
|
480
490
|
for name in self.cat_features:
|
|
481
491
|
# Remove constant categorical features
|
|
@@ -525,7 +535,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
525
535
|
x, emb_columns = self.group_embeddings(x)
|
|
526
536
|
params["embedding_features"] = emb_columns
|
|
527
537
|
if self.cat_features:
|
|
528
|
-
x = fill_na_cat_features(x, self.cat_features)
|
|
538
|
+
# x = fill_na_cat_features(x, self.cat_features)
|
|
529
539
|
params["cat_features"] = self.cat_features
|
|
530
540
|
|
|
531
541
|
return x, y, params
|
|
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
|
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
|
+
from upgini.__about__ import __version__
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def ipython_available() -> bool:
|
|
@@ -166,12 +167,12 @@ def make_html_report(
|
|
|
166
167
|
/*-pdf-frame-border: 1;*/
|
|
167
168
|
}}
|
|
168
169
|
@frame content_frame {{
|
|
169
|
-
left: 10pt; width: 574pt; top: 50pt; height:
|
|
170
|
+
left: 10pt; width: 574pt; top: 50pt; height: 742pt;
|
|
170
171
|
/*-pdf-frame-border: 1;*/
|
|
171
172
|
}}
|
|
172
173
|
@frame footer_frame {{
|
|
173
174
|
-pdf-frame-content: footer_content;
|
|
174
|
-
left: 10pt; width: 574pt; top: 802pt; height:
|
|
175
|
+
left: 10pt; width: 574pt; top: 802pt; height: 40pt;
|
|
175
176
|
/*-pdf-frame-border: 1;*/
|
|
176
177
|
}}
|
|
177
178
|
}}
|
|
@@ -234,7 +235,8 @@ def make_html_report(
|
|
|
234
235
|
<div id="header_content">UPGINI</div>
|
|
235
236
|
<div id="footer_content">
|
|
236
237
|
© Upgini</br>
|
|
237
|
-
sales@upgini.com
|
|
238
|
+
sales@upgini.com</br>
|
|
239
|
+
Launched by version {__version__}
|
|
238
240
|
</div>
|
|
239
241
|
|
|
240
242
|
<h1>Data search report</h1>
|
|
@@ -257,7 +259,7 @@ def make_html_report(
|
|
|
257
259
|
}
|
|
258
260
|
<h3>Relevant data sources</h3>
|
|
259
261
|
{make_table(relevant_datasources_df)}
|
|
260
|
-
<h3>All relevant features. Listing</h3>
|
|
262
|
+
<h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
|
|
261
263
|
{make_table(relevant_features_df, wrap_long_string=25)}
|
|
262
264
|
{"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
|
|
263
265
|
if autofe_descriptions_df is not None
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.1.280.dev0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|