upgini 1.1.280.dev1__tar.gz → 1.1.282__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (64)
  1. {upgini-1.1.280.dev1 → upgini-1.1.282}/PKG-INFO +2 -2
  2. {upgini-1.1.280.dev1 → upgini-1.1.282}/README.md +1 -1
  3. upgini-1.1.282/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/features_enricher.py +4 -4
  5. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/metrics.py +13 -2
  6. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/display_utils.py +6 -4
  7. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/sklearn_ext.py +28 -19
  8. upgini-1.1.280.dev1/src/upgini/__about__.py +0 -1
  9. {upgini-1.1.280.dev1 → upgini-1.1.282}/.gitignore +0 -0
  10. {upgini-1.1.280.dev1 → upgini-1.1.282}/LICENSE +0 -0
  11. {upgini-1.1.280.dev1 → upgini-1.1.282}/pyproject.toml +0 -0
  12. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/dataset.py +0 -0
  28. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/errors.py +0 -0
  29. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/http.py +0 -0
  30. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/metadata.py +0 -0
  33. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/normalizer/phone_normalizer.py +0 -0
  35. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/search_task.py +0 -0
  44. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/spinner.py +0 -0
  45. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.280.dev1 → upgini-1.1.282}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.280.dev1
3
+ Version: 1.1.282
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -839,4 +839,4 @@ Some convenient ways to start contributing are:
839
839
  - [More perks for registered users](https://profile.upgini.com)
840
840
 
841
841
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
842
- Please report it here.</a></sup>
842
+ Please report it here</a></sup>
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
799
799
  - [More perks for registered users](https://profile.upgini.com)
800
800
 
801
801
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
802
- Please report it here.</a></sup>
802
+ Please report it here</a></sup>
@@ -0,0 +1 @@
1
+ __version__ = "1.1.282"
@@ -930,6 +930,7 @@ class FeaturesEnricher(TransformerMixin):
930
930
  scoring,
931
931
  groups=groups,
932
932
  text_features=self.generate_features,
933
+ has_date=has_date,
933
934
  )
934
935
  metric = wrapper.metric_name
935
936
  multiplier = wrapper.multiplier
@@ -956,6 +957,7 @@ class FeaturesEnricher(TransformerMixin):
956
957
  add_params=custom_loss_add_params,
957
958
  groups=groups,
958
959
  text_features=self.generate_features,
960
+ has_date=has_date,
959
961
  )
960
962
  etalon_metric = baseline_estimator.cross_val_predict(
961
963
  fitting_X, y_sorted, self.baseline_score_column
@@ -981,6 +983,7 @@ class FeaturesEnricher(TransformerMixin):
981
983
  add_params=custom_loss_add_params,
982
984
  groups=groups,
983
985
  text_features=self.generate_features,
986
+ has_date=has_date,
984
987
  )
985
988
  enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
986
989
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
@@ -1333,8 +1336,6 @@ class FeaturesEnricher(TransformerMixin):
1333
1336
  excluding_search_keys = list(search_keys.keys())
1334
1337
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1335
1338
  excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1336
- meta = self._search_task.get_all_features_metadata_v2()
1337
- zero_importance_client_features = [m.name for m in meta if m.source == "etalon" and m.shap_value == 0.0]
1338
1339
 
1339
1340
  client_features = [
1340
1341
  c
@@ -1344,7 +1345,6 @@ class FeaturesEnricher(TransformerMixin):
1344
1345
  excluding_search_keys
1345
1346
  + list(self.fit_dropped_features)
1346
1347
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1347
- + zero_importance_client_features
1348
1348
  )
1349
1349
  ]
1350
1350
 
@@ -3720,7 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
3720
3720
  if y is not None:
3721
3721
  with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
3722
3722
  pickle.dump(sample(y, xy_sample_index), y_file)
3723
- if eval_set:
3723
+ if eval_set and _num_samples(eval_set[0][0]) > 0:
3724
3724
  eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
3725
3725
  with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
3726
3726
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
@@ -298,6 +298,7 @@ class EstimatorWrapper:
298
298
  scorer = check_scoring(self.estimator, scoring=self.scorer)
299
299
 
300
300
  if baseline_score_column is not None and self.metric_name == "GINI":
301
+ self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
301
302
  metric = roc_auc_score(y, x[baseline_score_column])
302
303
  else:
303
304
  cv_results = cross_validate(
@@ -314,9 +315,17 @@ class EstimatorWrapper:
314
315
  metrics_by_fold = cv_results["test_score"]
315
316
  self.cv_estimators = cv_results["estimator"]
316
317
 
318
+ self.check_fold_metrics(metrics_by_fold)
319
+
317
320
  metric = np.mean(metrics_by_fold) * self.multiplier
318
321
  return self.post_process_metric(metric)
319
322
 
323
+ def check_fold_metrics(self, metrics_by_fold: List[float]):
324
+ first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
325
+ for metric in metrics_by_fold[1:]:
326
+ if first_metric_sign * metric < 0:
327
+ self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
328
+
320
329
  def post_process_metric(self, metric: float) -> float:
321
330
  if self.metric_name == "GINI":
322
331
  metric = 2 * metric - 1
@@ -346,6 +355,7 @@ class EstimatorWrapper:
346
355
  text_features: Optional[List[str]] = None,
347
356
  add_params: Optional[Dict[str, Any]] = None,
348
357
  groups: Optional[List[str]] = None,
358
+ has_date: Optional[bool] = None,
349
359
  ) -> EstimatorWrapper:
350
360
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
351
361
  kwargs = {
@@ -360,6 +370,7 @@ class EstimatorWrapper:
360
370
  }
361
371
  if estimator is None:
362
372
  params = dict()
373
+ params["has_time"] = has_date
363
374
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
364
375
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
365
376
  if target_type == ModelTaskType.MULTICLASS:
@@ -475,7 +486,7 @@ class CatBoostWrapper(EstimatorWrapper):
475
486
 
476
487
  # Find rest categorical features
477
488
  self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
- x = fill_na_cat_features(x, self.cat_features)
489
+ # x = fill_na_cat_features(x, self.cat_features)
479
490
  unique_cat_features = []
480
491
  for name in self.cat_features:
481
492
  # Remove constant categorical features
@@ -525,7 +536,7 @@ class CatBoostWrapper(EstimatorWrapper):
525
536
  x, emb_columns = self.group_embeddings(x)
526
537
  params["embedding_features"] = emb_columns
527
538
  if self.cat_features:
528
- x = fill_na_cat_features(x, self.cat_features)
539
+ # x = fill_na_cat_features(x, self.cat_features)
529
540
  params["cat_features"] = self.cat_features
530
541
 
531
542
  return x, y, params
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
9
9
 
10
10
  import pandas as pd
11
11
  from xhtml2pdf import pisa
12
+ from upgini.__about__ import __version__
12
13
 
13
14
 
14
15
  def ipython_available() -> bool:
@@ -166,12 +167,12 @@ def make_html_report(
166
167
  /*-pdf-frame-border: 1;*/
167
168
  }}
168
169
  @frame content_frame {{
169
- left: 10pt; width: 574pt; top: 50pt; height: 752pt;
170
+ left: 10pt; width: 574pt; top: 50pt; height: 742pt;
170
171
  /*-pdf-frame-border: 1;*/
171
172
  }}
172
173
  @frame footer_frame {{
173
174
  -pdf-frame-content: footer_content;
174
- left: 10pt; width: 574pt; top: 802pt; height: 30pt;
175
+ left: 10pt; width: 574pt; top: 802pt; height: 40pt;
175
176
  /*-pdf-frame-border: 1;*/
176
177
  }}
177
178
  }}
@@ -234,7 +235,8 @@ def make_html_report(
234
235
  <div id="header_content">UPGINI</div>
235
236
  <div id="footer_content">
236
237
  © Upgini</br>
237
- sales@upgini.com
238
+ sales@upgini.com</br>
239
+ Launched by version {__version__}
238
240
  </div>
239
241
 
240
242
  <h1>Data search report</h1>
@@ -257,7 +259,7 @@ def make_html_report(
257
259
  }
258
260
  <h3>Relevant data sources</h3>
259
261
  {make_table(relevant_datasources_df)}
260
- <h3>All relevant features. Listing</h3>
262
+ <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
261
263
  {make_table(relevant_features_df, wrap_long_string=25)}
262
264
  {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
263
265
  if autofe_descriptions_df is not None
@@ -17,7 +17,7 @@ from sklearn.base import clone, is_classifier
17
17
  from sklearn.exceptions import FitFailedWarning, NotFittedError
18
18
  from sklearn.metrics import check_scoring
19
19
  from sklearn.metrics._scorer import _MultimetricScorer
20
- from sklearn.model_selection import check_cv
20
+ from sklearn.model_selection import StratifiedKFold, check_cv
21
21
  from sklearn.utils.fixes import np_version, parse_version
22
22
  from sklearn.utils.validation import indexable
23
23
 
@@ -312,25 +312,34 @@ def cross_validate(
312
312
  ret[key] = train_scores_dict[name]
313
313
 
314
314
  return ret
315
- except Exception:
315
+ except ValueError as e:
316
316
  # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
317
- raise
318
- # fit_params["use_best_model"] = False
319
- # return original_cross_validate(
320
- # estimator,
321
- # X,
322
- # y,
323
- # groups=groups,
324
- # scoring=scoring,
325
- # cv=cv,
326
- # n_jobs=n_jobs,
327
- # verbose=verbose,
328
- # fit_params=fit_params,
329
- # pre_dispatch=pre_dispatch,
330
- # return_train_score=return_train_score,
331
- # return_estimator=return_estimator,
332
- # error_score=error_score,
333
- # )
317
+ if hasattr(e, "args") and len(e.args) > 0 and "Only one class present in y_true" in e.args[0]:
318
+ # Try change CV to StratifiedKFold and retry
319
+ if hasattr(cv, "shuffle"):
320
+ shuffle = cv.shuffle
321
+ else:
322
+ shuffle = False
323
+ if hasattr(cv, "random_state"):
324
+ random_state = cv.random_state
325
+ else:
326
+ random_state = None
327
+ return cross_validate(
328
+ estimator,
329
+ x,
330
+ y,
331
+ groups=groups,
332
+ scoring=scoring,
333
+ cv=StratifiedKFold(n_splits=cv.get_n_splits(), shuffle=shuffle, random_state=random_state),
334
+ n_jobs=n_jobs,
335
+ verbose=verbose,
336
+ fit_params=fit_params,
337
+ pre_dispatch=pre_dispatch,
338
+ return_train_score=return_train_score,
339
+ return_estimator=return_estimator,
340
+ error_score=error_score,
341
+ )
342
+ raise e
334
343
 
335
344
 
336
345
  def _fit_and_score(
@@ -1 +0,0 @@
1
- __version__ = "1.1.280.dev1"
File without changes
File without changes
File without changes