upgini 1.2.87.dev3__py3-none-any.whl → 1.2.87.dev5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.87.dev3"
+__version__ = "1.2.87.dev5"
upgini/features_enricher.py CHANGED
@@ -300,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -315,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
             self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
             file_metadata = self._search_task.get_file_metadata(trace_id)
-            x_columns = [c.originalName or c.name for c in file_metadata.columns]
+            x_columns = [c.name for c in file_metadata.columns]
             self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
             df = pd.DataFrame(columns=x_columns)
             self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -2299,11 +2299,16 @@ if response.status_code == 200:

         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)

-        self.__validate_search_keys(self.search_keys, self.search_id)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
+        # If there are no important features, return original dataframe
+        if not filtered_columns:
+            msg = self.bundle.get("no_important_features_for_transform")
+            self.__log_warning(msg, show_support_link=True)
+            return X, {c: c for c in X.columns}, [], dict()

-        if len(self.feature_names_) == 0:
-            self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, [], {}
+        self.__validate_search_keys(self.search_keys, self.search_id)

         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
@@ -2342,9 +2347,7 @@ if response.status_code == 200:

         is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

-        columns_to_drop = [
-            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
-        ]
+        columns_to_drop = [c for c in df.columns if c in self.feature_names_]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
@@ -2400,6 +2403,17 @@ if response.status_code == 200:
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming

+        # If there are no external features, we don't call backend on transform
+        external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
+        if not external_features:
+            self.logger.warning(
+                "No external features found, returning original dataframe"
+                f" with generated important features: {filtered_columns}"
+            )
+            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
+            return df[filtered_columns], columns_renaming, generated_features, search_keys
+
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
         features_for_transform = self._search_task.get_features_for_transform() or []
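
Note: this new short-circuit avoids the backend round-trip entirely when every positive-SHAP feature came from the client's own data ("etalon" source). A self-contained sketch of the filter, with a hypothetical FeatureMeta dataclass standing in for the real features_meta entries:

from dataclasses import dataclass

@dataclass
class FeatureMeta:
    name: str
    shap_value: float
    source: str  # "etalon" marks a client-provided feature

features_meta = [
    FeatureMeta("client_feature", 0.4, "etalon"),
    FeatureMeta("generated_feature", 0.1, "etalon"),
]
external_features = [
    fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"
]
print(bool(external_features))  # False -> transform skips the backend call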
@@ -2444,6 +2458,8 @@ if response.status_code == 200:
         # Explode multiple search keys
         df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)

+        # Convert search keys and generate features on them
+
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         if email_column:
@@ -2632,17 +2648,15 @@ if response.status_code == 200:
             how="left",
         )

+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in filtered_columns
+        ]
         selecting_columns = [
             c
-            for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
-            if c not in self.dropped_client_feature_names_
+            for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
+            if c not in self.zero_shap_client_features
         ]
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
-        selecting_columns.extend(
-            c for c in filtered_columns if c in result.columns and c not in validated_X.columns
-        )
+        selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)

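Note: the rewritten selection keeps a generated feature only when feature selection is off or the feature survived SHAP filtering, then appends enriched columns while skipping anything already selected. A small sketch of the first predicate with hypothetical inputs:

fit_select_features = True
generated_features = ["autofe_feature_1", "autofe_feature_2"]
filtered_columns = ["autofe_feature_1", "external_score"]

selected_generated_features = [
    c for c in generated_features if not fit_select_features or c in filtered_columns
]
print(selected_generated_features)  # ['autofe_feature_1']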
@@ -2942,7 +2956,10 @@ if response.status_code == 200:
             self.__log_warning(fintech_warning)
         df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
         if full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+            if len(df) == 0:
+                raise ValidationError(full_duplicates_warning)
+            else:
+                self.__log_warning(full_duplicates_warning)

         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
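
Note: `clean_full_duplicates` can drop every row when all duplicates carry conflicting targets, and fit previously continued on the empty frame only to fail later with a less actionable error. A condensed sketch of the escalated handling (`handle_dedup_result` is a hypothetical name for the inlined logic above):

import logging
from typing import Optional

import pandas as pd

class ValidationError(Exception):
    pass

def handle_dedup_result(
    df: pd.DataFrame, warning: Optional[str], logger: logging.Logger
) -> pd.DataFrame:
    if warning:
        if len(df) == 0:
            # Nothing left to fit on: fail fast instead of logging and continuing.
            raise ValidationError(warning)
        logger.warning(warning)
    return df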
@@ -3345,9 +3362,13 @@ if response.status_code == 200:
         Xy[TARGET] = y
         validated_y = Xy[TARGET].copy()

-        if validated_y.nunique() < 2:
+        y_nunique = validated_y.nunique()
+        if y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant"))

+        if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
+
         return validated_y

     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3422,9 +3443,13 @@ if response.status_code == 200:
         else:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

-        if validated_eval_y.nunique() < 2:
+        eval_y_nunique = validated_eval_y.nunique()
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
+
         return validated_eval_X, validated_eval_y

     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
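
Note: both target validators now compute the unique-value count once and, for an explicitly binary task, reject anything other than exactly two classes. A runnable sketch of the new check (the enum and message below are simplified stand-ins for upgini's ModelTaskType and resource-bundle strings):

from enum import Enum

import pandas as pd

class ModelTaskType(Enum):
    BINARY = "BINARY"

class ValidationError(Exception):
    pass

def validate_target(y: pd.Series, task: ModelTaskType) -> pd.Series:
    y_nunique = y.nunique()
    if y_nunique < 2:
        raise ValidationError("y is constant")
    # New in dev5: a BINARY task must have exactly 2 unique target values.
    if task == ModelTaskType.BINARY and y_nunique != 2:
        raise ValidationError(
            f"Binary target should contain only 2 unique values, but {y_nunique} found"
        )
    return y

try:
    validate_target(pd.Series([0, 1, 2]), ModelTaskType.BINARY)
except ValidationError as e:
    print(e)  # Binary target should contain only 2 unique values, but 3 found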
@@ -3966,10 +3991,11 @@ if response.status_code == 200:
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

+        # To be sure that names with hash suffixes
         df = df.rename(columns=original_names_dict)

         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -3981,7 +4007,7 @@ if response.status_code == 200:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns

             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -3997,13 +4023,13 @@ if response.status_code == 200:
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))

         for feature_meta in features_meta:
-
-            is_client_feature = feature_meta.name in df.columns
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            is_client_feature = original_name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if self.fit_select_features:
-                    self.dropped_client_feature_names_.append(feature_meta.name)
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue

             # Use only important features
upgini/metrics.py CHANGED
@@ -807,14 +807,16 @@ class CatBoostWrapper(EstimatorWrapper):
         try:
             from catboost import Pool

+            cat_features = None
             if cat_encoder is not None:
                 if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
                     encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
-                    cat_features = None
                 else:
                     encoded = cat_encoder.transform(x[self.cat_features])
                     cat_features = encoded.columns.to_list()
                 x[self.cat_features] = encoded
+            else:
+                cat_features = self.cat_features

             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
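
Note: previously `cat_features` was assigned only inside the `cat_encoder is not None` branch, so the `Pool(...)` call below it could reference an unbound (or stale) local when no encoder was supplied. Initializing it up front and adding the `else` fallback to `self.cat_features` closes both gaps. A condensed, self-contained sketch of the fixed control flow (names simplified; not the wrapper's real signature):

from typing import List, Optional

import pandas as pd

def resolve_cat_features(
    x: pd.DataFrame,
    declared_cat_features: List[str],
    cat_encoder=None,
    time_series_cv: bool = False,
) -> Optional[List[str]]:
    cat_features = None  # now initialized on every path
    if cat_encoder is not None:
        if time_series_cv:
            # time-series CV: encode to ints and pass no cat_features to Pool
            encoded = cat_encoder.transform(x[declared_cat_features]).astype(int)
        else:
            encoded = cat_encoder.transform(x[declared_cat_features])
            cat_features = encoded.columns.to_list()
        x[declared_cat_features] = encoded
    else:
        # no encoder supplied: hand the declared categorical columns to CatBoost
        cat_features = declared_cat_features
    return cat_features

# With no encoder, the declared columns now reach CatBoost's Pool:
df = pd.DataFrame({"city": ["NY", "LA"], "spend": [1.0, 2.0]})
print(resolve_cat_features(df, ["city"]))  # ['city']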
upgini/resource_bundle/strings.properties CHANGED
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
 invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
 no_important_features_for_transform=There are no important features for transform. Return input as transformed
 search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

 # Validation errors
 # params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
 dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
-dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
upgini/utils/deduplicate_utils.py CHANGED
@@ -192,7 +192,7 @@ def clean_full_duplicates(
     unique_columns.remove(TARGET)
     marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
     if marked_duplicates.sum() > 0:
-        dups_indices = df[marked_duplicates].index.to_list()
+        dups_indices = df[marked_duplicates].index.to_list()[:100]
         nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
         num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
         share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
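
Note: capping the collected indexes at 100 keeps the `dataset_diff_target_duplicates` warning (reworded above to "Sample of incorrect row indexes") from dumping tens of thousands of index values into logs on large datasets. A tiny pandas repro of the capped sample:

import pandas as pd

df = pd.DataFrame({"feature": [1] * 500, "target": [0, 1] * 250})
unique_columns = ["feature"]  # identical features, conflicting targets
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
if marked_duplicates.sum() > 0:
    dups_indices = df[marked_duplicates].index.to_list()[:100]
    print(len(dups_indices))  # 100 reported, not all 500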
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.87.dev3
+Version: 1.2.87.dev5
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.87.dev3.dist-info → upgini-1.2.87.dev5.dist-info}/RECORD RENAMED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=-MoNpjvEXC0uIle8xxIgQduzBZJlNzuW-1rPMTm_xc8,28
+upgini/__about__.py,sha256=wcphyJpGJs2mZPWvsK3omRtXm2Q4NsYXyO0X5zcwLMw,28
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=n8KBoBgJApLiRv4wXeSgfS-PfbB1D5aDOJfFnL0q6v8,214487
+upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
 upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
 upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
-upgini/metrics.py,sha256=CR_MKBcq1RlNMXeqc9S374JzHgunMl-mEmlTnZAm_VI,45236
+upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=U_ewTI-qPww4X3WcFG3qDf_jv2vo6RrlCehVDjqtzEI,27991
+upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -52,7 +52,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
-upgini/utils/deduplicate_utils.py,sha256=jm9ARZ0fbJFF3aJqj-xm_T6lNh-WErM0H0h6B_L1xQc,8948
+upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
 upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.87.dev3.dist-info/METADATA,sha256=Pm-acVK8TpDLvPsO0qluwSjmu0cb3FHmtXmqMj--2Ag,49167
-upgini-1.2.87.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.87.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.87.dev3.dist-info/RECORD,,
+upgini-1.2.87.dev5.dist-info/METADATA,sha256=Jdb6gn8ijXK4ccs5hC9yEPA6dQBzc5FtelPXOJgBfJA,49167
+upgini-1.2.87.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.87.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.87.dev5.dist-info/RECORD,,