upgini 1.2.118__tar.gz → 1.2.119__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {upgini-1.2.118 → upgini-1.2.119}/PKG-INFO +1 -1
  2. upgini-1.2.119/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/features_enricher.py +67 -38
  4. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/display_utils.py +12 -9
  5. upgini-1.2.118/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.118 → upgini-1.2.119}/.gitignore +0 -0
  7. {upgini-1.2.118 → upgini-1.2.119}/LICENSE +0 -0
  8. {upgini-1.2.118 → upgini-1.2.119}/README.md +0 -0
  9. {upgini-1.2.118 → upgini-1.2.119}/pyproject.toml +0 -0
  10. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/http.py +0 -0
  37. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/metrics.py +0 -0
  41. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/normalizer/__init__.py +0 -0
  42. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/normalizer/normalize_utils.py +0 -0
  43. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/resource_bundle/__init__.py +0 -0
  44. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/resource_bundle/exceptions.py +0 -0
  45. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/resource_bundle/strings.properties +0 -0
  46. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/psi.py +0 -0
  75. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.118 → upgini-1.2.119}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.118
3
+ Version: 1.2.119
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.119"
@@ -854,7 +854,7 @@ class FeaturesEnricher(TransformerMixin):
854
854
  raise e
855
855
  finally:
856
856
  self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
857
-
857
+
858
858
  return result
859
859
 
860
860
  def calculate_metrics(
@@ -1741,7 +1741,8 @@ class FeaturesEnricher(TransformerMixin):
1741
1741
  c
1742
1742
  for c in (validated_X.columns.to_list() + generated_features)
1743
1743
  if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
1744
- and c not in (
1744
+ and c
1745
+ not in (
1745
1746
  excluding_search_keys
1746
1747
  + list(self.fit_dropped_features)
1747
1748
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -2215,7 +2216,8 @@ class FeaturesEnricher(TransformerMixin):
2215
2216
  progress_callback=progress_callback,
2216
2217
  add_fit_system_record_id=True,
2217
2218
  )
2218
- if enriched_df is None:
2219
+ if enriched_df is None or len(enriched_df) == 0 or len(enriched_df.columns) == 0:
2220
+ self.logger.warning(f"Empty enriched dataframe returned: {enriched_df}, returning None")
2219
2221
  return None
2220
2222
 
2221
2223
  x_columns = [
@@ -2519,7 +2521,7 @@ if response.status_code == 200:
2519
2521
  if len(self.feature_names_) == 0:
2520
2522
  msg = self.bundle.get("no_important_features_for_transform")
2521
2523
  self.__log_warning(msg, show_support_link=True)
2522
- return X, {c: c for c in X.columns}, [], dict()
2524
+ return None, {}, [], self.search_keys
2523
2525
 
2524
2526
  self.__validate_search_keys(self.search_keys, self.search_id)
2525
2527
 
@@ -2527,7 +2529,7 @@ if response.status_code == 200:
2527
2529
  msg = self.bundle.get("transform_with_paid_features")
2528
2530
  self.logger.warning(msg)
2529
2531
  self.__display_support_link(msg)
2530
- return None, {c: c for c in X.columns}, [], {}
2532
+ return None, {}, [], self.search_keys
2531
2533
 
2532
2534
  features_meta = self._search_task.get_all_features_metadata_v2()
2533
2535
  online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
@@ -2550,7 +2552,7 @@ if response.status_code == 200:
2550
2552
  self.logger.warning(msg)
2551
2553
  print(msg)
2552
2554
  show_request_quote_button()
2553
- return None, {c: c for c in X.columns}, [], {}
2555
+ return None, {}, [], {}
2554
2556
  else:
2555
2557
  msg = self.bundle.get("transform_usage_info").format(
2556
2558
  transform_usage.limit, transform_usage.transformed_rows
@@ -2620,14 +2622,33 @@ if response.status_code == 200:
2620
2622
 
2621
2623
  # If there are no external features, we don't call backend on transform
2622
2624
  external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2623
- if not external_features:
2625
+ if len(external_features) == 0:
2624
2626
  self.logger.warning(
2625
2627
  "No external features found, returning original dataframe"
2626
2628
  f" with generated important features: {self.feature_names_}"
2627
2629
  )
2628
- filtered_columns = [c for c in self.feature_names_ if c in df.columns]
2629
- self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
2630
- return df[filtered_columns], columns_renaming, generated_features, search_keys
2630
+ df = df.rename(columns=columns_renaming)
2631
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
2632
+ search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
2633
+ selecting_columns = self._selecting_input_and_generated_columns(
2634
+ validated_Xy, generated_features, keep_input, trace_id
2635
+ )
2636
+ self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
2637
+ if add_fit_system_record_id:
2638
+ df = self._add_fit_system_record_id(
2639
+ df,
2640
+ search_keys,
2641
+ SYSTEM_RECORD_ID,
2642
+ TARGET,
2643
+ columns_renaming,
2644
+ self.id_columns,
2645
+ self.cv,
2646
+ self.model_task_type,
2647
+ self.logger,
2648
+ self.bundle,
2649
+ )
2650
+ selecting_columns.append(SYSTEM_RECORD_ID)
2651
+ return df[selecting_columns], columns_renaming, generated_features, search_keys
2631
2652
 
2632
2653
  # Don't pass all features in backend on transform
2633
2654
  runtime_parameters = self._get_copy_of_runtime_parameters()
@@ -2845,29 +2866,12 @@ if response.status_code == 200:
2845
2866
  how="left",
2846
2867
  )
2847
2868
 
2848
- fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2849
- new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
2850
-
2851
- selected_generated_features = [
2852
- c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2853
- ]
2854
- if keep_input is True:
2855
- selected_input_columns = [
2856
- c
2857
- for c in validated_Xy.columns
2858
- if not self.fit_select_features
2859
- or c in self.feature_names_
2860
- or c in new_columns_on_transform
2861
- or c in self.search_keys
2862
- or c in (self.id_columns or [])
2863
- or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
2864
- ]
2865
- else:
2866
- selected_input_columns = []
2867
-
2868
- selecting_columns = selected_input_columns + selected_generated_features
2869
+ selecting_columns = self._selecting_input_and_generated_columns(
2870
+ validated_Xy, generated_features, keep_input, trace_id
2871
+ )
2869
2872
  selecting_columns.extend(
2870
- c for c in result.columns
2873
+ c
2874
+ for c in result.columns
2871
2875
  if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
2872
2876
  )
2873
2877
  if add_fit_system_record_id:
@@ -2895,6 +2899,35 @@ if response.status_code == 200:
2895
2899
 
2896
2900
  return result, columns_renaming, generated_features, search_keys
2897
2901
 
2902
+ def _selecting_input_and_generated_columns(
2903
+ self,
2904
+ validated_Xy: pd.DataFrame,
2905
+ generated_features: list[str],
2906
+ keep_input: bool,
2907
+ trace_id: str,
2908
+ ):
2909
+ fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2910
+ new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
2911
+
2912
+ selected_generated_features = [
2913
+ c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2914
+ ]
2915
+ if keep_input is True:
2916
+ selected_input_columns = [
2917
+ c
2918
+ for c in validated_Xy.columns
2919
+ if not self.fit_select_features
2920
+ or c in self.feature_names_
2921
+ or c in new_columns_on_transform
2922
+ or c in self.search_keys
2923
+ or c in (self.id_columns or [])
2924
+ or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
2925
+ ]
2926
+ else:
2927
+ selected_input_columns = []
2928
+
2929
+ return selected_input_columns + selected_generated_features
2930
+
2898
2931
  def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
2899
2932
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
2900
2933
  if search_id:
@@ -3727,9 +3760,7 @@ if response.status_code == 200:
3727
3760
  eval_types = validated_eval_X.dtypes
3728
3761
  # Find columns with different types
3729
3762
  diff_cols = [
3730
- (col, x_types[col], eval_types[col])
3731
- for col in x_types.index
3732
- if x_types[col] != eval_types[col]
3763
+ (col, x_types[col], eval_types[col]) for col in x_types.index if x_types[col] != eval_types[col]
3733
3764
  ]
3734
3765
  diff_col_names = [col for col, _, _ in diff_cols]
3735
3766
  # print columns with different types
@@ -3815,9 +3846,7 @@ if response.status_code == 200:
3815
3846
  return Xy[X.columns].copy(), Xy[TARGET].copy()
3816
3847
 
3817
3848
  @staticmethod
3818
- def _sort_by_system_record_id(
3819
- X: pd.DataFrame, y: pd.Series, cv: CVType | None
3820
- ) -> tuple[pd.DataFrame, pd.Series]:
3849
+ def _sort_by_system_record_id(X: pd.DataFrame, y: pd.Series, cv: CVType | None) -> tuple[pd.DataFrame, pd.Series]:
3821
3850
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3822
3851
  record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
3823
3852
  Xy = X.copy()
@@ -269,19 +269,22 @@ def make_html_report(
269
269
  if search_keys is not None
270
270
  else ""
271
271
  }
272
- {"<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
273
- if metrics_df is not None
274
- else ""
272
+ {
273
+ "<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
274
+ if metrics_df is not None
275
+ else ""
275
276
  }
276
- {"<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
277
- if len(relevant_datasources_df) > 0
278
- else ""
277
+ {
278
+ "<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
279
+ if len(relevant_datasources_df) > 0
280
+ else ""
279
281
  }
280
282
  <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
281
283
  {make_table(relevant_features_df, wrap_long_string=25)}
282
- {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
283
- if autofe_descriptions_df is not None
284
- else ""
284
+ {
285
+ "<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
286
+ if autofe_descriptions_df is not None and len(autofe_descriptions_df) > 0
287
+ else ""
285
288
  }
286
289
  <p>To buy found data sources, please contact: <a href='mailto:sales@upgini.com'>sales@upgini.com</a></p>
287
290
  <p>Best regards, </br><b>Upgini Team</b></p>
@@ -1 +0,0 @@
1
- __version__ = "1.2.118"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes