upgini 1.1.280.dev1__tar.gz → 1.1.282__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.280.dev1 → upgini-1.1.282}/PKG-INFO +2 -2
- {upgini-1.1.280.dev1 → upgini-1.1.282}/README.md +1 -1
- upgini-1.1.282/src/upgini/__about__.py +1 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/features_enricher.py +4 -4
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/metrics.py +13 -2
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/display_utils.py +6 -4
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/sklearn_ext.py +28 -19
- upgini-1.1.280.dev1/src/upgini/__about__.py +0 -1
- {upgini-1.1.280.dev1 → upgini-1.1.282}/.gitignore +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/LICENSE +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/pyproject.toml +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/dataset.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/errors.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/http.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/metadata.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/search_task.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/spinner.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.280.dev1
|
|
3
|
+
Version: 1.1.282
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -839,4 +839,4 @@ Some convenient ways to start contributing are:
|
|
|
839
839
|
- [More perks for registered users](https://profile.upgini.com)
|
|
840
840
|
|
|
841
841
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
842
|
-
Please report it here
|
|
842
|
+
Please report it here</a></sup>
|
|
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
|
|
|
799
799
|
- [More perks for registered users](https://profile.upgini.com)
|
|
800
800
|
|
|
801
801
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
802
|
-
Please report it here
|
|
802
|
+
Please report it here</a></sup>
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.282"
|
|
@@ -930,6 +930,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
930
930
|
scoring,
|
|
931
931
|
groups=groups,
|
|
932
932
|
text_features=self.generate_features,
|
|
933
|
+
has_date=has_date,
|
|
933
934
|
)
|
|
934
935
|
metric = wrapper.metric_name
|
|
935
936
|
multiplier = wrapper.multiplier
|
|
@@ -956,6 +957,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
956
957
|
add_params=custom_loss_add_params,
|
|
957
958
|
groups=groups,
|
|
958
959
|
text_features=self.generate_features,
|
|
960
|
+
has_date=has_date,
|
|
959
961
|
)
|
|
960
962
|
etalon_metric = baseline_estimator.cross_val_predict(
|
|
961
963
|
fitting_X, y_sorted, self.baseline_score_column
|
|
@@ -981,6 +983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
981
983
|
add_params=custom_loss_add_params,
|
|
982
984
|
groups=groups,
|
|
983
985
|
text_features=self.generate_features,
|
|
986
|
+
has_date=has_date,
|
|
984
987
|
)
|
|
985
988
|
enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
986
989
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
@@ -1333,8 +1336,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1333
1336
|
excluding_search_keys = list(search_keys.keys())
|
|
1334
1337
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1335
1338
|
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1336
|
-
meta = self._search_task.get_all_features_metadata_v2()
|
|
1337
|
-
zero_importance_client_features = [m.name for m in meta if m.source == "etalon" and m.shap_value == 0.0]
|
|
1338
1339
|
|
|
1339
1340
|
client_features = [
|
|
1340
1341
|
c
|
|
@@ -1344,7 +1345,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1344
1345
|
excluding_search_keys
|
|
1345
1346
|
+ list(self.fit_dropped_features)
|
|
1346
1347
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1347
|
-
+ zero_importance_client_features
|
|
1348
1348
|
)
|
|
1349
1349
|
]
|
|
1350
1350
|
|
|
@@ -3720,7 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3720
3720
|
if y is not None:
|
|
3721
3721
|
with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
|
|
3722
3722
|
pickle.dump(sample(y, xy_sample_index), y_file)
|
|
3723
|
-
if eval_set:
|
|
3723
|
+
if eval_set and _num_samples(eval_set[0][0]) > 0:
|
|
3724
3724
|
eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
|
|
3725
3725
|
with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
|
|
3726
3726
|
pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
|
|
@@ -298,6 +298,7 @@ class EstimatorWrapper:
|
|
|
298
298
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
299
299
|
|
|
300
300
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
301
|
+
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
|
301
302
|
metric = roc_auc_score(y, x[baseline_score_column])
|
|
302
303
|
else:
|
|
303
304
|
cv_results = cross_validate(
|
|
@@ -314,9 +315,17 @@ class EstimatorWrapper:
|
|
|
314
315
|
metrics_by_fold = cv_results["test_score"]
|
|
315
316
|
self.cv_estimators = cv_results["estimator"]
|
|
316
317
|
|
|
318
|
+
self.check_fold_metrics(metrics_by_fold)
|
|
319
|
+
|
|
317
320
|
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
318
321
|
return self.post_process_metric(metric)
|
|
319
322
|
|
|
323
|
+
def check_fold_metrics(self, metrics_by_fold: List[float]):
|
|
324
|
+
first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
|
|
325
|
+
for metric in metrics_by_fold[1:]:
|
|
326
|
+
if first_metric_sign * metric < 0:
|
|
327
|
+
self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
|
|
328
|
+
|
|
320
329
|
def post_process_metric(self, metric: float) -> float:
|
|
321
330
|
if self.metric_name == "GINI":
|
|
322
331
|
metric = 2 * metric - 1
|
|
@@ -346,6 +355,7 @@ class EstimatorWrapper:
|
|
|
346
355
|
text_features: Optional[List[str]] = None,
|
|
347
356
|
add_params: Optional[Dict[str, Any]] = None,
|
|
348
357
|
groups: Optional[List[str]] = None,
|
|
358
|
+
has_date: Optional[bool] = None,
|
|
349
359
|
) -> EstimatorWrapper:
|
|
350
360
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
351
361
|
kwargs = {
|
|
@@ -360,6 +370,7 @@ class EstimatorWrapper:
|
|
|
360
370
|
}
|
|
361
371
|
if estimator is None:
|
|
362
372
|
params = dict()
|
|
373
|
+
params["has_time"] = has_date
|
|
363
374
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
364
375
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
365
376
|
if target_type == ModelTaskType.MULTICLASS:
|
|
@@ -475,7 +486,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
475
486
|
|
|
476
487
|
# Find rest categorical features
|
|
477
488
|
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
478
|
-
x = fill_na_cat_features(x, self.cat_features)
|
|
489
|
+
# x = fill_na_cat_features(x, self.cat_features)
|
|
479
490
|
unique_cat_features = []
|
|
480
491
|
for name in self.cat_features:
|
|
481
492
|
# Remove constant categorical features
|
|
@@ -525,7 +536,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
525
536
|
x, emb_columns = self.group_embeddings(x)
|
|
526
537
|
params["embedding_features"] = emb_columns
|
|
527
538
|
if self.cat_features:
|
|
528
|
-
x = fill_na_cat_features(x, self.cat_features)
|
|
539
|
+
# x = fill_na_cat_features(x, self.cat_features)
|
|
529
540
|
params["cat_features"] = self.cat_features
|
|
530
541
|
|
|
531
542
|
return x, y, params
|
|
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
|
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
|
+
from upgini.__about__ import __version__
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def ipython_available() -> bool:
|
|
@@ -166,12 +167,12 @@ def make_html_report(
|
|
|
166
167
|
/*-pdf-frame-border: 1;*/
|
|
167
168
|
}}
|
|
168
169
|
@frame content_frame {{
|
|
169
|
-
left: 10pt; width: 574pt; top: 50pt; height:
|
|
170
|
+
left: 10pt; width: 574pt; top: 50pt; height: 742pt;
|
|
170
171
|
/*-pdf-frame-border: 1;*/
|
|
171
172
|
}}
|
|
172
173
|
@frame footer_frame {{
|
|
173
174
|
-pdf-frame-content: footer_content;
|
|
174
|
-
left: 10pt; width: 574pt; top: 802pt; height:
|
|
175
|
+
left: 10pt; width: 574pt; top: 802pt; height: 40pt;
|
|
175
176
|
/*-pdf-frame-border: 1;*/
|
|
176
177
|
}}
|
|
177
178
|
}}
|
|
@@ -234,7 +235,8 @@ def make_html_report(
|
|
|
234
235
|
<div id="header_content">UPGINI</div>
|
|
235
236
|
<div id="footer_content">
|
|
236
237
|
© Upgini</br>
|
|
237
|
-
sales@upgini.com
|
|
238
|
+
sales@upgini.com</br>
|
|
239
|
+
Launched by version {__version__}
|
|
238
240
|
</div>
|
|
239
241
|
|
|
240
242
|
<h1>Data search report</h1>
|
|
@@ -257,7 +259,7 @@ def make_html_report(
|
|
|
257
259
|
}
|
|
258
260
|
<h3>Relevant data sources</h3>
|
|
259
261
|
{make_table(relevant_datasources_df)}
|
|
260
|
-
<h3>All relevant features. Listing</h3>
|
|
262
|
+
<h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
|
|
261
263
|
{make_table(relevant_features_df, wrap_long_string=25)}
|
|
262
264
|
{"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
|
|
263
265
|
if autofe_descriptions_df is not None
|
|
@@ -17,7 +17,7 @@ from sklearn.base import clone, is_classifier
|
|
|
17
17
|
from sklearn.exceptions import FitFailedWarning, NotFittedError
|
|
18
18
|
from sklearn.metrics import check_scoring
|
|
19
19
|
from sklearn.metrics._scorer import _MultimetricScorer
|
|
20
|
-
from sklearn.model_selection import check_cv
|
|
20
|
+
from sklearn.model_selection import StratifiedKFold, check_cv
|
|
21
21
|
from sklearn.utils.fixes import np_version, parse_version
|
|
22
22
|
from sklearn.utils.validation import indexable
|
|
23
23
|
|
|
@@ -312,25 +312,34 @@ def cross_validate(
|
|
|
312
312
|
ret[key] = train_scores_dict[name]
|
|
313
313
|
|
|
314
314
|
return ret
|
|
315
|
-
except
|
|
315
|
+
except ValueError as e:
|
|
316
316
|
# logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
317
|
+
if hasattr(e, "args") and len(e.args) > 0 and "Only one class present in y_true" in e.args[0]:
|
|
318
|
+
# Try change CV to StratifiedKFold and retry
|
|
319
|
+
if hasattr(cv, "shuffle"):
|
|
320
|
+
shuffle = cv.shuffle
|
|
321
|
+
else:
|
|
322
|
+
shuffle = False
|
|
323
|
+
if hasattr(cv, "random_state"):
|
|
324
|
+
random_state = cv.random_state
|
|
325
|
+
else:
|
|
326
|
+
random_state = None
|
|
327
|
+
return cross_validate(
|
|
328
|
+
estimator,
|
|
329
|
+
x,
|
|
330
|
+
y,
|
|
331
|
+
groups=groups,
|
|
332
|
+
scoring=scoring,
|
|
333
|
+
cv=StratifiedKFold(n_splits=cv.get_n_splits(), shuffle=shuffle, random_state=random_state),
|
|
334
|
+
n_jobs=n_jobs,
|
|
335
|
+
verbose=verbose,
|
|
336
|
+
fit_params=fit_params,
|
|
337
|
+
pre_dispatch=pre_dispatch,
|
|
338
|
+
return_train_score=return_train_score,
|
|
339
|
+
return_estimator=return_estimator,
|
|
340
|
+
error_score=error_score,
|
|
341
|
+
)
|
|
342
|
+
raise e
|
|
334
343
|
|
|
335
344
|
|
|
336
345
|
def _fit_and_score(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.1.280.dev1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|