upgini 1.1.244a18__tar.gz → 1.1.244a20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (81)
  1. {upgini-1.1.244a18/src/upgini.egg-info → upgini-1.1.244a20}/PKG-INFO +1 -1
  2. {upgini-1.1.244a18 → upgini-1.1.244a20}/setup.py +1 -1
  3. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/features_enricher.py +3 -2
  4. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/metrics.py +15 -18
  5. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/deduplicate_utils.py +2 -0
  6. {upgini-1.1.244a18 → upgini-1.1.244a20/src/upgini.egg-info}/PKG-INFO +1 -1
  7. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_blocked_time_series.py +13 -6
  8. {upgini-1.1.244a18 → upgini-1.1.244a20}/LICENSE +0 -0
  9. {upgini-1.1.244a18 → upgini-1.1.244a20}/README.md +0 -0
  10. {upgini-1.1.244a18 → upgini-1.1.244a20}/pyproject.toml +0 -0
  11. {upgini-1.1.244a18 → upgini-1.1.244a20}/setup.cfg +0 -0
  12. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/dataset.py +0 -0
  27. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/errors.py +0 -0
  28. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/fingerprint.js +0 -0
  29. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/http.py +0 -0
  30. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/metadata.py +0 -0
  33. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/normalizer/phone_normalizer.py +0 -0
  35. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/display_utils.py +0 -0
  52. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/email_utils.py +0 -0
  53. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/fallback_progress_bar.py +0 -0
  54. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/features_validator.py +0 -0
  55. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/format.py +0 -0
  56. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/ip_utils.py +0 -0
  57. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/phone_utils.py +0 -0
  58. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/postal_code_utils.py +0 -0
  59. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/progress_bar.py +0 -0
  60. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/sklearn_ext.py +0 -0
  61. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini/version_validator.py +0 -0
  65. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/SOURCES.txt +0 -0
  66. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/dependency_links.txt +0 -0
  67. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/requires.txt +0 -0
  68. {upgini-1.1.244a18 → upgini-1.1.244a20}/src/upgini.egg-info/top_level.txt +0 -0
  69. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_binary_dataset.py +0 -0
  70. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_categorical_dataset.py +0 -0
  71. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_continuous_dataset.py +0 -0
  72. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_country_utils.py +0 -0
  73. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_custom_loss_utils.py +0 -0
  74. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_datetime_utils.py +0 -0
  75. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_email_utils.py +0 -0
  76. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_etalon_validation.py +0 -0
  77. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_features_enricher.py +0 -0
  78. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_metrics.py +0 -0
  79. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_phone_utils.py +0 -0
  80. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_postal_code_utils.py +0 -0
  81. {upgini-1.1.244a18 → upgini-1.1.244a20}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.244a18
3
+ Version: 1.1.244a20
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.244a18"
43
+ version = "1.1.244a20"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -1855,8 +1855,6 @@ class FeaturesEnricher(TransformerMixin):
1855
1855
  )
1856
1856
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1857
1857
 
1858
- df = clean_full_duplicates(df, self.logger, silent=silent_mode)
1859
-
1860
1858
  df = df.reset_index(drop=True)
1861
1859
  system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
1862
1860
  df_with_original_index = df[system_columns_with_original_index].copy()
@@ -1865,6 +1863,8 @@ class FeaturesEnricher(TransformerMixin):
1865
1863
 
1866
1864
  df_without_features = df.drop(columns=non_keys_columns)
1867
1865
 
1866
+ df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
1867
+
1868
1868
  del df
1869
1869
  gc.collect()
1870
1870
 
@@ -2092,6 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
2092
2092
  msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2093
2093
  print(msg)
2094
2094
  self.logger.warning(msg)
2095
+ self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2095
2096
 
2096
2097
  validate_scoring_argument(scoring)
2097
2098
 
@@ -414,6 +414,9 @@ class CatBoostWrapper(EstimatorWrapper):
414
414
  self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
415
415
  embedding_features = []
416
416
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
417
+ self.logger.info(
418
+ f"Embedding features count more than 3, so group them into one vector for CatBoost: {self.emb_features}"
419
+ )
417
420
  X, embedding_features = self.group_embeddings(X)
418
421
  params["embedding_features"] = embedding_features
419
422
  else:
@@ -421,15 +424,13 @@ class CatBoostWrapper(EstimatorWrapper):
421
424
 
422
425
  # Find text features from passed in generate_features
423
426
  if self.text_features is not None:
424
- self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
427
+ self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
428
+ self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
429
+ self.logger.info(f"Rest text features after checks: {self.text_features}")
425
430
  params["text_features"] = self.text_features
426
431
 
427
432
  # Find rest categorical features
428
- self.cat_features = _get_cat_features(X)
429
- if self.text_features is not None:
430
- self.cat_features = [
431
- f for f in self.cat_features if f not in self.text_features and f not in embedding_features
432
- ]
433
+ self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
433
434
  X = fill_na_cat_features(X, self.cat_features)
434
435
  unique_cat_features = []
435
436
  for name in self.cat_features:
@@ -456,6 +457,7 @@ class CatBoostWrapper(EstimatorWrapper):
456
457
 
457
458
  del self.estimator._init_params["cat_features"]
458
459
 
460
+ self.logger.info(f"Selected categorical features: {self.cat_features}")
459
461
  params["cat_features"] = self.cat_features
460
462
 
461
463
  return X, y, groups, params
@@ -473,13 +475,11 @@ class CatBoostWrapper(EstimatorWrapper):
473
475
  X, y, params = super()._prepare_to_calculate(X, y)
474
476
  if self.text_features:
475
477
  params["text_features"] = self.text_features
476
- # if self.emb_groups:
477
478
  if self.emb_features:
478
479
  X, emb_columns = self.group_embeddings(X)
479
480
  params["embedding_features"] = emb_columns
480
481
  if self.cat_features:
481
482
  X = fill_na_cat_features(X, self.cat_features)
482
- if self.cat_features:
483
483
  params["cat_features"] = self.cat_features
484
484
 
485
485
  return X, y, params
@@ -633,8 +633,13 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
633
633
  return scoring, metric_name, multiplier
634
634
 
635
635
 
636
- def _get_cat_features(X: pd.DataFrame) -> List[str]:
637
- return [c for c in X.columns if not is_numeric_dtype(X[c])]
636
+ def _get_cat_features(
637
+ X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
638
+ ) -> List[str]:
639
+ text_features = text_features or []
640
+ emb_features = emb_features or []
641
+ exclude_features = text_features + emb_features
642
+ return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
638
643
 
639
644
 
640
645
  def _get_add_params(input_params, add_params):
@@ -731,11 +736,3 @@ def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFr
731
736
  na_filter = df[c].str.lower().isin(NA_VALUES)
732
737
  df.loc[na_filter, c] = NA_REPLACEMENT
733
738
  return df
734
-
735
-
736
- def _is_too_many_categorical_values(X: pd.DataFrame) -> bool:
737
- many_values_features_count = 0
738
- for f in _get_cat_features(X):
739
- if X[f].astype("string").nunique() > 100:
740
- many_values_features_count += 1
741
- return many_values_features_count >= 2
@@ -103,6 +103,8 @@ def clean_full_duplicates(
103
103
  unique_columns = df.columns.tolist()
104
104
  if SYSTEM_RECORD_ID in unique_columns:
105
105
  unique_columns.remove(SYSTEM_RECORD_ID)
106
+ if "sort_id" in unique_columns:
107
+ unique_columns.remove("sort_id")
106
108
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
107
109
  df = df.drop_duplicates(subset=unique_columns)
108
110
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.244a18
3
+ Version: 1.1.244a20
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -33,13 +33,20 @@ def test_bts_split_logic():
33
33
  def test_bts_metrics():
34
34
  X, y, cv, _, model = _prepare_data()
35
35
  cv_result = set(cross_val_score(model, X, y, cv=cv, scoring="roc_auc"))
36
- assert cv_result == {
37
- 0.4559664254320743,
38
- 0.4767320313326982,
39
- 0.4811855209016638,
40
- 0.48947924927306374,
41
- 0.5150543675843606,
36
+ assert {round(r, 3) for r in cv_result} == {
37
+ 0.456,
38
+ 0.477,
39
+ 0.481,
40
+ 0.489,
41
+ 0.515,
42
42
  }
43
+ # assert cv_result == {
44
+ # 0.4559664254320743,
45
+ # 0.4767320313326982,
46
+ # 0.4811855209016638,
47
+ # 0.48947924927306374,
48
+ # 0.5150543675843606,
49
+ # }
43
50
 
44
51
 
45
52
  def test_bts_exceptions():
File without changes
File without changes
File without changes
File without changes