upgini 1.2.91a3884.dev3__py3-none-any.whl → 1.2.91a3884.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.91a3884.dev3"
1
+ __version__ = "1.2.91a3884.dev5"
upgini/dataset.py CHANGED
@@ -48,17 +48,9 @@ except Exception:
48
48
  )
49
49
 
50
50
 
51
- class Dataset: # (pd.DataFrame):
51
+ class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
53
  MAX_ROWS = 200_000
54
- FIT_SAMPLE_ROWS = 200_000
55
- FIT_SAMPLE_THRESHOLD = 200_000
56
- FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
57
- FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
58
- FIT_SAMPLE_THRESHOLD_TS = 54_000
59
- FIT_SAMPLE_ROWS_TS = 54_000
60
- BINARY_MIN_SAMPLE_THRESHOLD = 5_000
61
- MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
62
54
  IMBALANCE_THESHOLD = 0.6
63
55
  BINARY_BOOTSTRAP_LOOPS = 5
64
56
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -576,8 +568,8 @@ class Dataset: # (pd.DataFrame):
576
568
  def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
577
569
  if runtime_parameters is not None and runtime_parameters.properties is not None:
578
570
  if self.cv_type is not None and self.cv_type.is_time_series():
579
- runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
580
- runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
571
+ runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
572
+ runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
581
573
  return runtime_parameters
582
574
 
583
575
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
upgini/features_enricher.py CHANGED
@@ -302,7 +302,7 @@ class FeaturesEnricher(TransformerMixin):
302
302
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
303
303
  self.metrics: Optional[pd.DataFrame] = None
304
304
  self.feature_names_ = []
305
- self.zero_shap_client_features = []
305
+ self.external_source_feature_names = []
306
306
  self.feature_importances_ = []
307
307
  self.search_id = search_id
308
308
  self.disable_force_downsampling = disable_force_downsampling
@@ -981,6 +981,19 @@ class FeaturesEnricher(TransformerMixin):
981
981
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
982
982
  estimator, validated_X, self.search_keys
983
983
  )
984
+ if self.id_columns and self.id_columns_encoder is not None:
985
+ if cat_features_from_backend:
986
+ cat_features_from_backend = [
987
+ c
988
+ for c in cat_features_from_backend
989
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
990
+ ]
991
+ if client_cat_features:
992
+ client_cat_features = [
993
+ c
994
+ for c in client_cat_features
995
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
996
+ ]
984
997
  for cat_feature in cat_features_from_backend:
985
998
  original_cat_feature = self.fit_columns_renaming.get(cat_feature)
986
999
  if original_cat_feature in self.search_keys:
@@ -2346,7 +2359,9 @@ if response.status_code == 200:
2346
2359
 
2347
2360
  is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
2348
2361
 
2349
- columns_to_drop = [c for c in df.columns if c in self.feature_names_ and c not in (self.id_columns or [])]
2362
+ columns_to_drop = [
2363
+ c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
2364
+ ]
2350
2365
  if len(columns_to_drop) > 0:
2351
2366
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2352
2367
  self.logger.warning(msg)
@@ -2654,7 +2669,7 @@ if response.status_code == 200:
2654
2669
  selecting_columns = [
2655
2670
  c
2656
2671
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2657
- if c not in self.zero_shap_client_features or c in (self.id_columns or [])
2672
+ if c not in self.external_source_feature_names or c in (self.id_columns or [])
2658
2673
  ]
2659
2674
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2660
2675
  if add_fit_system_record_id:
@@ -4048,7 +4063,7 @@ if response.status_code == 200:
4048
4063
  df = df.rename(columns=original_names_dict)
4049
4064
 
4050
4065
  self.feature_names_ = []
4051
- self.zero_shap_client_features = []
4066
+ self.external_source_feature_names = []
4052
4067
  self.feature_importances_ = []
4053
4068
  features_info = []
4054
4069
  features_info_without_links = []
@@ -4079,10 +4094,11 @@ if response.status_code == 200:
4079
4094
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4080
4095
  is_client_feature = original_name in df.columns
4081
4096
 
4097
+ if not is_client_feature:
4098
+ self.external_source_feature_names.append(original_name)
4099
+
4082
4100
  # TODO make a decision about selected features based on special flag from mlb
4083
4101
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4084
- if is_client_feature and self.fit_select_features:
4085
- self.zero_shap_client_features.append(original_name)
4086
4102
  continue
4087
4103
 
4088
4104
  # Use only important features
upgini/metrics.py CHANGED
@@ -815,7 +815,7 @@ class CatBoostWrapper(EstimatorWrapper):
815
815
  else:
816
816
  encoded = cat_encoder.transform(x[self.cat_features])
817
817
  cat_features = encoded.columns.to_list()
818
- x[self.cat_features] = encoded
818
+ x.loc[:, self.cat_features] = encoded
819
819
  else:
820
820
  cat_features = self.cat_features
821
821
 
upgini/utils/sample_utils.py CHANGED
@@ -15,7 +15,7 @@ TS_MIN_DIFFERENT_IDS_RATIO = 0.2
15
15
  TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
16
16
  TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
17
17
  TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
18
- FIT_SAMPLE_ROWS_TS = 54_000
18
+ FIT_SAMPLE_ROWS_TS = 100_000
19
19
 
20
20
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
21
21
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
@@ -173,6 +173,7 @@ def sample_time_series_train_eval(
173
173
  logger=logger,
174
174
  **kwargs,
175
175
  )
176
+ logger.info(f"Eval set size: {len(eval_df)}")
176
177
  df = pd.concat([train_df, eval_df])
177
178
 
178
179
  elif len(train_df) > max_rows:
@@ -188,6 +189,8 @@ def sample_time_series_train_eval(
188
189
  else:
189
190
  df = train_df
190
191
 
192
+ logger.info(f"Train set size: {len(df)}")
193
+
191
194
  return df
192
195
 
193
196
 
upgini-1.2.91a3884.dev5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev3
3
+ Version: 1.2.91a3884.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.91a3884.dev5.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=55Sg-JLu4aw-5ANNPanS_ciHPSsxXTa8YndbgltGREA,33
1
+ upgini/__about__.py,sha256=tqdF5EG5u2XotiGWCSnsQ61GODo019Lr-s097ItdAHs,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=c0rZ-ydrnCdrTzx10WZl4WbO3LdyuF0fUCRD8Ugjitg,33093
4
+ upgini/dataset.py,sha256=MituLJZTDdIwSk-Ia5G1pS52PERNHQ2P99FgCH2kTjQ,32790
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=plU6GbGwc4Pk3eXPzYFezq33yTslX615Ew-HAvdLyVs,217468
6
+ upgini/features_enricher.py,sha256=udQdXpqVO4YAwEHjUzS195k2jxe5_CtZ-KTpWRicjfs,218225
7
7
  upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
8
8
  upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
9
- upgini/metrics.py,sha256=ju7JPwLUe8vtFUGbBV6w6ecySd952XucrqToc1edVBs,45306
9
+ upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sample_utils.py,sha256=PpMXRVTPKi6TyAo0gPhF0OmXmutecHdonM7WYUsB1Wo,15249
67
+ upgini/utils/sample_utils.py,sha256=jQ1em2FRnMFLul9ujuBgs5XZ9jAZ4eM4FHT3aDSjOy8,15351
68
68
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
69
69
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
70
70
  upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.91a3884.dev3.dist-info/METADATA,sha256=teoc8dCmv4mb2eBV6QruZag3xnwK3YAdlKCHuIKllXw,49546
75
- upgini-1.2.91a3884.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.91a3884.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.91a3884.dev3.dist-info/RECORD,,
74
+ upgini-1.2.91a3884.dev5.dist-info/METADATA,sha256=A9MDv_VCFrWDDw2Xyo7Wsx3ps6ECwTul36gmw-wujgI,49546
75
+ upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
+ upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.91a3884.dev5.dist-info/RECORD,,