upgini 1.1.280.dev0__tar.gz → 1.1.281__tar.gz

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (64)
  1. {upgini-1.1.280.dev0 → upgini-1.1.281}/PKG-INFO +1 -1
  2. upgini-1.1.281/src/upgini/__about__.py +1 -0
  3. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/dataset.py +1 -1
  4. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/features_enricher.py +16 -8
  5. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/metrics.py +12 -2
  6. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/display_utils.py +6 -4
  7. upgini-1.1.280.dev0/src/upgini/__about__.py +0 -1
  8. {upgini-1.1.280.dev0 → upgini-1.1.281}/.gitignore +0 -0
  9. {upgini-1.1.280.dev0 → upgini-1.1.281}/LICENSE +0 -0
  10. {upgini-1.1.280.dev0 → upgini-1.1.281}/README.md +0 -0
  11. {upgini-1.1.280.dev0 → upgini-1.1.281}/pyproject.toml +0 -0
  12. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/errors.py +0 -0
  28. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/http.py +0 -0
  29. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/mdc/__init__.py +0 -0
  30. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/mdc/context.py +0 -0
  31. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/metadata.py +0 -0
  32. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/email_utils.py +0 -0
  53. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/fallback_progress_bar.py +0 -0
  54. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/features_validator.py +0 -0
  55. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/format.py +0 -0
  56. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/ip_utils.py +0 -0
  57. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/phone_utils.py +0 -0
  58. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/postal_code_utils.py +0 -0
  59. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/progress_bar.py +0 -0
  60. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/sklearn_ext.py +0 -0
  61. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.280.dev0 → upgini-1.1.281}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.280.dev0
3
+ Version: 1.1.281
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.1.281"
@@ -246,7 +246,7 @@ class Dataset: # (pd.DataFrame):
246
246
  if len(columns_to_fix) > 0:
247
247
  self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
248
248
  for col in columns_to_fix:
249
- self.data[col] = self.data[col].astype("string").str.replace(",", ".").astype(np.float64)
249
+ self.data[col] = self.data[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
250
250
 
251
251
  @staticmethod
252
252
  def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
@@ -930,6 +930,7 @@ class FeaturesEnricher(TransformerMixin):
930
930
  scoring,
931
931
  groups=groups,
932
932
  text_features=self.generate_features,
933
+ has_date=has_date,
933
934
  )
934
935
  metric = wrapper.metric_name
935
936
  multiplier = wrapper.multiplier
@@ -956,6 +957,7 @@ class FeaturesEnricher(TransformerMixin):
956
957
  add_params=custom_loss_add_params,
957
958
  groups=groups,
958
959
  text_features=self.generate_features,
960
+ has_date=has_date,
959
961
  )
960
962
  etalon_metric = baseline_estimator.cross_val_predict(
961
963
  fitting_X, y_sorted, self.baseline_score_column
@@ -981,6 +983,7 @@ class FeaturesEnricher(TransformerMixin):
981
983
  add_params=custom_loss_add_params,
982
984
  groups=groups,
983
985
  text_features=self.generate_features,
986
+ has_date=has_date,
984
987
  )
985
988
  enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
986
989
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
@@ -1333,8 +1336,6 @@ class FeaturesEnricher(TransformerMixin):
1333
1336
  excluding_search_keys = list(search_keys.keys())
1334
1337
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1335
1338
  excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1336
- meta = self._search_task.get_all_features_metadata_v2()
1337
- zero_importance_client_features = [m for m in meta if m.source == "etalon" and m.shap_value == 0.0]
1338
1339
 
1339
1340
  client_features = [
1340
1341
  c
@@ -1344,7 +1345,6 @@ class FeaturesEnricher(TransformerMixin):
1344
1345
  excluding_search_keys
1345
1346
  + list(self.fit_dropped_features)
1346
1347
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1347
- + zero_importance_client_features
1348
1348
  )
1349
1349
  ]
1350
1350
 
@@ -1403,9 +1403,9 @@ class FeaturesEnricher(TransformerMixin):
1403
1403
  if len(decimal_columns_to_fix) > 0:
1404
1404
  self.logger.warning(f"Convert strings with decimal comma to float: {decimal_columns_to_fix}")
1405
1405
  for col in decimal_columns_to_fix:
1406
- fitting_X[col] = fitting_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1406
+ fitting_X[col] = fitting_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
1407
1407
  fitting_enriched_X[col] = (
1408
- fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1408
+ fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
1409
1409
  )
1410
1410
 
1411
1411
  fitting_eval_set_dict = dict()
@@ -1441,9 +1441,17 @@ class FeaturesEnricher(TransformerMixin):
1441
1441
  # Correct string features with decimal commas
1442
1442
  if len(decimal_columns_to_fix) > 0:
1443
1443
  for col in decimal_columns_to_fix:
1444
- fitting_eval_X[col] = fitting_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1444
+ fitting_eval_X[col] = (
1445
+ fitting_eval_X[col]
1446
+ .astype("string").str
1447
+ .replace(",", ".", regex=False)
1448
+ .astype(np.float64)
1449
+ )
1445
1450
  fitting_enriched_eval_X[col] = (
1446
- fitting_enriched_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1451
+ fitting_enriched_eval_X[col]
1452
+ .astype("string").str
1453
+ .replace(",", ".", regex=False)
1454
+ .astype(np.float64)
1447
1455
  )
1448
1456
 
1449
1457
  fitting_eval_set_dict[idx] = (
@@ -3712,7 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
3712
3720
  if y is not None:
3713
3721
  with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
3714
3722
  pickle.dump(sample(y, xy_sample_index), y_file)
3715
- if eval_set:
3723
+ if eval_set and _num_samples(eval_set[0][0]) > 0:
3716
3724
  eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
3717
3725
  with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
3718
3726
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
@@ -314,9 +314,17 @@ class EstimatorWrapper:
314
314
  metrics_by_fold = cv_results["test_score"]
315
315
  self.cv_estimators = cv_results["estimator"]
316
316
 
317
+ self.check_fold_metrics(metrics_by_fold)
318
+
317
319
  metric = np.mean(metrics_by_fold) * self.multiplier
318
320
  return self.post_process_metric(metric)
319
321
 
322
+ def check_fold_metrics(self, metrics_by_fold: List[float]):
323
+ first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
324
+ for metric in metrics_by_fold[1:]:
325
+ if first_metric_sign * metric < 0:
326
+ self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
327
+
320
328
  def post_process_metric(self, metric: float) -> float:
321
329
  if self.metric_name == "GINI":
322
330
  metric = 2 * metric - 1
@@ -346,6 +354,7 @@ class EstimatorWrapper:
346
354
  text_features: Optional[List[str]] = None,
347
355
  add_params: Optional[Dict[str, Any]] = None,
348
356
  groups: Optional[List[str]] = None,
357
+ has_date: Optional[bool] = None,
349
358
  ) -> EstimatorWrapper:
350
359
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
351
360
  kwargs = {
@@ -360,6 +369,7 @@ class EstimatorWrapper:
360
369
  }
361
370
  if estimator is None:
362
371
  params = dict()
372
+ params["has_time"] = has_date
363
373
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
364
374
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
365
375
  if target_type == ModelTaskType.MULTICLASS:
@@ -475,7 +485,7 @@ class CatBoostWrapper(EstimatorWrapper):
475
485
 
476
486
  # Find rest categorical features
477
487
  self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
- x = fill_na_cat_features(x, self.cat_features)
488
+ # x = fill_na_cat_features(x, self.cat_features)
479
489
  unique_cat_features = []
480
490
  for name in self.cat_features:
481
491
  # Remove constant categorical features
@@ -525,7 +535,7 @@ class CatBoostWrapper(EstimatorWrapper):
525
535
  x, emb_columns = self.group_embeddings(x)
526
536
  params["embedding_features"] = emb_columns
527
537
  if self.cat_features:
528
- x = fill_na_cat_features(x, self.cat_features)
538
+ # x = fill_na_cat_features(x, self.cat_features)
529
539
  params["cat_features"] = self.cat_features
530
540
 
531
541
  return x, y, params
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
9
9
 
10
10
  import pandas as pd
11
11
  from xhtml2pdf import pisa
12
+ from upgini.__about__ import __version__
12
13
 
13
14
 
14
15
  def ipython_available() -> bool:
@@ -166,12 +167,12 @@ def make_html_report(
166
167
  /*-pdf-frame-border: 1;*/
167
168
  }}
168
169
  @frame content_frame {{
169
- left: 10pt; width: 574pt; top: 50pt; height: 752pt;
170
+ left: 10pt; width: 574pt; top: 50pt; height: 742pt;
170
171
  /*-pdf-frame-border: 1;*/
171
172
  }}
172
173
  @frame footer_frame {{
173
174
  -pdf-frame-content: footer_content;
174
- left: 10pt; width: 574pt; top: 802pt; height: 30pt;
175
+ left: 10pt; width: 574pt; top: 802pt; height: 40pt;
175
176
  /*-pdf-frame-border: 1;*/
176
177
  }}
177
178
  }}
@@ -234,7 +235,8 @@ def make_html_report(
234
235
  <div id="header_content">UPGINI</div>
235
236
  <div id="footer_content">
236
237
  © Upgini</br>
237
- sales@upgini.com
238
+ sales@upgini.com</br>
239
+ Launched by version {__version__}
238
240
  </div>
239
241
 
240
242
  <h1>Data search report</h1>
@@ -257,7 +259,7 @@ def make_html_report(
257
259
  }
258
260
  <h3>Relevant data sources</h3>
259
261
  {make_table(relevant_datasources_df)}
260
- <h3>All relevant features. Listing</h3>
262
+ <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
261
263
  {make_table(relevant_features_df, wrap_long_string=25)}
262
264
  {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
263
265
  if autofe_descriptions_df is not None
@@ -1 +0,0 @@
1
- __version__ = "1.1.280.dev0"
File without changes
File without changes
File without changes
File without changes