upgini 1.1.244a19__tar.gz → 1.1.244a20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (81)
  1. {upgini-1.1.244a19/src/upgini.egg-info → upgini-1.1.244a20}/PKG-INFO +1 -1
  2. {upgini-1.1.244a19 → upgini-1.1.244a20}/setup.py +1 -1
  3. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/features_enricher.py +3 -2
  4. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/metrics.py +7 -1
  5. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/deduplicate_utils.py +2 -0
  6. {upgini-1.1.244a19 → upgini-1.1.244a20/src/upgini.egg-info}/PKG-INFO +1 -1
  7. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_blocked_time_series.py +13 -6
  8. {upgini-1.1.244a19 → upgini-1.1.244a20}/LICENSE +0 -0
  9. {upgini-1.1.244a19 → upgini-1.1.244a20}/README.md +0 -0
  10. {upgini-1.1.244a19 → upgini-1.1.244a20}/pyproject.toml +0 -0
  11. {upgini-1.1.244a19 → upgini-1.1.244a20}/setup.cfg +0 -0
  12. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/dataset.py +0 -0
  27. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/errors.py +0 -0
  28. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/fingerprint.js +0 -0
  29. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/http.py +0 -0
  30. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/metadata.py +0 -0
  33. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/normalizer/phone_normalizer.py +0 -0
  35. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/display_utils.py +0 -0
  52. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/email_utils.py +0 -0
  53. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/fallback_progress_bar.py +0 -0
  54. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/features_validator.py +0 -0
  55. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/format.py +0 -0
  56. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/ip_utils.py +0 -0
  57. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/phone_utils.py +0 -0
  58. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/postal_code_utils.py +0 -0
  59. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/progress_bar.py +0 -0
  60. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/sklearn_ext.py +0 -0
  61. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini/version_validator.py +0 -0
  65. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini.egg-info/SOURCES.txt +0 -0
  66. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini.egg-info/dependency_links.txt +0 -0
  67. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini.egg-info/requires.txt +0 -0
  68. {upgini-1.1.244a19 → upgini-1.1.244a20}/src/upgini.egg-info/top_level.txt +0 -0
  69. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_binary_dataset.py +0 -0
  70. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_categorical_dataset.py +0 -0
  71. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_continuous_dataset.py +0 -0
  72. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_country_utils.py +0 -0
  73. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_custom_loss_utils.py +0 -0
  74. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_datetime_utils.py +0 -0
  75. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_email_utils.py +0 -0
  76. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_etalon_validation.py +0 -0
  77. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_features_enricher.py +0 -0
  78. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_metrics.py +0 -0
  79. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_phone_utils.py +0 -0
  80. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_postal_code_utils.py +0 -0
  81. {upgini-1.1.244a19 → upgini-1.1.244a20}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.244a19
3
+ Version: 1.1.244a20
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.244a19"
43
+ version = "1.1.244a20"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -1855,8 +1855,6 @@ class FeaturesEnricher(TransformerMixin):
1855
1855
  )
1856
1856
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1857
1857
 
1858
- df = clean_full_duplicates(df, self.logger, silent=silent_mode)
1859
-
1860
1858
  df = df.reset_index(drop=True)
1861
1859
  system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
1862
1860
  df_with_original_index = df[system_columns_with_original_index].copy()
@@ -1865,6 +1863,8 @@ class FeaturesEnricher(TransformerMixin):
1865
1863
 
1866
1864
  df_without_features = df.drop(columns=non_keys_columns)
1867
1865
 
1866
+ df_without_features = clean_full_duplicates(df_without_features, self.logger, silent=silent_mode)
1867
+
1868
1868
  del df
1869
1869
  gc.collect()
1870
1870
 
@@ -2092,6 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
2092
2092
  msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2093
2093
  print(msg)
2094
2094
  self.logger.warning(msg)
2095
+ self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2095
2096
 
2096
2097
  validate_scoring_argument(scoring)
2097
2098
 
@@ -414,6 +414,9 @@ class CatBoostWrapper(EstimatorWrapper):
414
414
  self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
415
415
  embedding_features = []
416
416
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
417
+ self.logger.info(
418
+ f"Embedding features count more than 3, so group them into one vector for CatBoost: {self.emb_features}"
419
+ )
417
420
  X, embedding_features = self.group_embeddings(X)
418
421
  params["embedding_features"] = embedding_features
419
422
  else:
@@ -421,7 +424,9 @@ class CatBoostWrapper(EstimatorWrapper):
421
424
 
422
425
  # Find text features from passed in generate_features
423
426
  if self.text_features is not None:
424
- self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
427
+ self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
428
+ self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
429
+ self.logger.info(f"Rest text features after checks: {self.text_features}")
425
430
  params["text_features"] = self.text_features
426
431
 
427
432
  # Find rest categorical features
@@ -452,6 +457,7 @@ class CatBoostWrapper(EstimatorWrapper):
452
457
 
453
458
  del self.estimator._init_params["cat_features"]
454
459
 
460
+ self.logger.info(f"Selected categorical features: {self.cat_features}")
455
461
  params["cat_features"] = self.cat_features
456
462
 
457
463
  return X, y, groups, params
@@ -103,6 +103,8 @@ def clean_full_duplicates(
103
103
  unique_columns = df.columns.tolist()
104
104
  if SYSTEM_RECORD_ID in unique_columns:
105
105
  unique_columns.remove(SYSTEM_RECORD_ID)
106
+ if "sort_id" in unique_columns:
107
+ unique_columns.remove("sort_id")
106
108
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
107
109
  df = df.drop_duplicates(subset=unique_columns)
108
110
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.244a19
3
+ Version: 1.1.244a20
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -33,13 +33,20 @@ def test_bts_split_logic():
33
33
  def test_bts_metrics():
34
34
  X, y, cv, _, model = _prepare_data()
35
35
  cv_result = set(cross_val_score(model, X, y, cv=cv, scoring="roc_auc"))
36
- assert cv_result == {
37
- 0.4559664254320743,
38
- 0.4767320313326982,
39
- 0.4811855209016638,
40
- 0.48947924927306374,
41
- 0.5150543675843606,
36
+ assert {round(r, 3) for r in cv_result} == {
37
+ 0.456,
38
+ 0.477,
39
+ 0.481,
40
+ 0.489,
41
+ 0.515,
42
42
  }
43
+ # assert cv_result == {
44
+ # 0.4559664254320743,
45
+ # 0.4767320313326982,
46
+ # 0.4811855209016638,
47
+ # 0.48947924927306374,
48
+ # 0.5150543675843606,
49
+ # }
43
50
 
44
51
 
45
52
  def test_bts_exceptions():
File without changes
File without changes
File without changes
File without changes