upgini 1.2.87.dev4__py3-none-any.whl → 1.2.88a3884.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.87.dev4"
1
+ __version__ = "1.2.88a3884.dev0"
@@ -300,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
300
300
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
301
301
  self.metrics: Optional[pd.DataFrame] = None
302
302
  self.feature_names_ = []
303
- self.dropped_client_feature_names_ = []
303
+ self.zero_shap_client_features = []
304
304
  self.feature_importances_ = []
305
305
  self.search_id = search_id
306
306
  self.disable_force_downsampling = disable_force_downsampling
@@ -315,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
315
315
  self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
316
316
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
317
317
  file_metadata = self._search_task.get_file_metadata(trace_id)
318
- x_columns = [c.originalName or c.name for c in file_metadata.columns]
318
+ x_columns = [c.name for c in file_metadata.columns]
319
319
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
320
320
  df = pd.DataFrame(columns=x_columns)
321
321
  self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -2347,9 +2347,7 @@ if response.status_code == 200:
2347
2347
 
2348
2348
  is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
2349
2349
 
2350
- columns_to_drop = [
2351
- c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2352
- ]
2350
+ columns_to_drop = [c for c in df.columns if c in self.feature_names_]
2353
2351
  if len(columns_to_drop) > 0:
2354
2352
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2355
2353
  self.logger.warning(msg)
@@ -2405,6 +2403,17 @@ if response.status_code == 200:
2405
2403
  df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2406
2404
  columns_renaming = normalizer.columns_renaming
2407
2405
 
2406
+ # If there are no external features, we don't call backend on transform
2407
+ external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2408
+ if not external_features:
2409
+ self.logger.warning(
2410
+ "No external features found, returning original dataframe"
2411
+ f" with generated important features: {filtered_columns}"
2412
+ )
2413
+ filtered_columns = [c for c in filtered_columns if c in df.columns]
2414
+ self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
2415
+ return df[filtered_columns], columns_renaming, generated_features, search_keys
2416
+
2408
2417
  # Don't pass all features in backend on transform
2409
2418
  runtime_parameters = self._get_copy_of_runtime_parameters()
2410
2419
  features_for_transform = self._search_task.get_features_for_transform() or []
@@ -2491,26 +2500,6 @@ if response.status_code == 200:
2491
2500
  converter = PostalCodeSearchKeyConverter(postal_code)
2492
2501
  df = converter.convert(df)
2493
2502
 
2494
- # TODO return X + generated features
2495
- # external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2496
- # if not external_features:
2497
- # # Unexplode dataframe back to original shape
2498
- # if len(unnest_search_keys) > 0:
2499
- # df = df.groupby(ENTITY_SYSTEM_RECORD_ID).first().reset_index()
2500
-
2501
- # # Get important features from etalon source
2502
- # etalon_features = [fm.name for fm in features_meta if fm.shap_value > 0 and fm.source == "etalon"]
2503
-
2504
- # # Select only etalon features that exist in dataframe
2505
- # available_etalon_features = [f for f in etalon_features if f in df.columns]
2506
-
2507
- # # Return original dataframe with only important etalon features
2508
- # result = df[available_etalon_features].copy()
2509
- # result.index = validated_Xy.index
2510
-
2511
- # return result, columns_renaming, generated_features, search_keys
2512
- # ...
2513
-
2514
2503
  meaning_types = {}
2515
2504
  meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
2516
2505
  meaning_types.update({col: key.value for col, key in search_keys.items()})
@@ -2659,14 +2648,15 @@ if response.status_code == 200:
2659
2648
  how="left",
2660
2649
  )
2661
2650
 
2651
+ selected_generated_features = [
2652
+ c for c in generated_features if not self.fit_select_features or c in filtered_columns
2653
+ ]
2662
2654
  selecting_columns = [
2663
2655
  c
2664
- for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
2665
- if c not in self.dropped_client_feature_names_
2656
+ for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2657
+ if c not in self.zero_shap_client_features
2666
2658
  ]
2667
- selecting_columns.extend(
2668
- c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2669
- )
2659
+ selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2670
2660
  if add_fit_system_record_id:
2671
2661
  selecting_columns.append(SORT_ID)
2672
2662
 
@@ -3372,9 +3362,13 @@ if response.status_code == 200:
3372
3362
  Xy[TARGET] = y
3373
3363
  validated_y = Xy[TARGET].copy()
3374
3364
 
3375
- if validated_y.nunique() < 2:
3365
+ y_nunique = validated_y.nunique()
3366
+ if y_nunique < 2:
3376
3367
  raise ValidationError(self.bundle.get("y_is_constant"))
3377
3368
 
3369
+ if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
3370
+ raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
3371
+
3378
3372
  return validated_y
3379
3373
 
3380
3374
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3449,9 +3443,13 @@ if response.status_code == 200:
3449
3443
  else:
3450
3444
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
3451
3445
 
3452
- if validated_eval_y.nunique() < 2:
3446
+ eval_y_nunique = validated_eval_y.nunique()
3447
+ if eval_y_nunique < 2:
3453
3448
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
3454
3449
 
3450
+ if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3451
+ raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
3452
+
3455
3453
  return validated_eval_X, validated_eval_y
3456
3454
 
3457
3455
  def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
@@ -3993,10 +3991,11 @@ if response.status_code == 200:
3993
3991
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
3994
3992
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3995
3993
 
3994
+ # Rename columns so that names with hash suffixes are mapped back to their original names
3996
3995
  df = df.rename(columns=original_names_dict)
3997
3996
 
3998
3997
  self.feature_names_ = []
3999
- self.dropped_client_feature_names_ = []
3998
+ self.zero_shap_client_features = []
4000
3999
  self.feature_importances_ = []
4001
4000
  features_info = []
4002
4001
  features_info_without_links = []
@@ -4008,7 +4007,7 @@ if response.status_code == 200:
4008
4007
  if feature_meta.name in original_names_dict.keys():
4009
4008
  feature_meta.name = original_names_dict[feature_meta.name]
4010
4009
 
4011
- is_client_feature = feature_meta.name in df.columns
4010
+ is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
4012
4011
 
4013
4012
  # Show and update shap values for client features only if select_features is True
4014
4013
  if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4024,13 +4023,13 @@ if response.status_code == 200:
4024
4023
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
4025
4024
 
4026
4025
  for feature_meta in features_meta:
4027
-
4028
- is_client_feature = feature_meta.name in df.columns
4026
+ original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4027
+ is_client_feature = original_name in df.columns
4029
4028
 
4030
4029
  # TODO make a decision about selected features based on special flag from mlb
4031
4030
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4032
- if self.fit_select_features:
4033
- self.dropped_client_feature_names_.append(feature_meta.name)
4031
+ if is_client_feature and self.fit_select_features:
4032
+ self.zero_shap_client_features.append(original_name)
4034
4033
  continue
4035
4034
 
4036
4035
  # Use only important features
upgini/metrics.py CHANGED
@@ -815,6 +815,8 @@ class CatBoostWrapper(EstimatorWrapper):
815
815
  encoded = cat_encoder.transform(x[self.cat_features])
816
816
  cat_features = encoded.columns.to_list()
817
817
  x[self.cat_features] = encoded
818
+ else:
819
+ cat_features = self.cat_features
818
820
 
819
821
  # Create Pool for fold data, if need (for example, when categorical features are present)
820
822
  fold_pool = Pool(
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
68
68
  invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
69
69
  no_important_features_for_transform=There are no important features for transform. Return input as transformed
70
70
  search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
71
+ binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
72
+ binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set
71
73
 
72
74
  # Validation errors
73
75
  # params validation
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.87.dev4
3
+ Version: 1.2.88a3884.dev0
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=snYX5GSOXf809cKcpmiRzx30DuIAydReavaEB237z1A,28
1
+ upgini/__about__.py,sha256=9UxVEFo0h8LcuPSKD5JSZ_n02IZF15Ksx8d1ITu4M7U,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=8KJiPXTFSiJUl5hJPEhMwhpXqPnGm3LrX31pKwlYe3k,215900
6
+ upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
7
7
  upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
8
8
  upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
9
- upgini/metrics.py,sha256=64M7RGbr9dItbXPYqWmeKhpBGHO4B69eV9Rj6P18_qg,45228
9
+ upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=Q__3SNuespbG9bRJ9Gq4E_w665NPe8EZ7Pcng8B1V8Y,28001
41
+ upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.87.dev4.dist-info/METADATA,sha256=PpZ-d4CiDjy-RnXvTGmyEXh-Q_Mjkdf1UaGyVFniqCw,49167
74
- upgini-1.2.87.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.87.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.87.dev4.dist-info/RECORD,,
73
+ upgini-1.2.88a3884.dev0.dist-info/METADATA,sha256=e_lwt9ydR712gQBymukF9Lc2W-5aqj5nrZa-6T-UXA4,49172
74
+ upgini-1.2.88a3884.dev0.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
+ upgini-1.2.88a3884.dev0.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.88a3884.dev0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any