upgini 1.2.91a3884.dev4__py3-none-any.whl → 1.2.91a3884.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.91a3884.dev4"
1
+ __version__ = "1.2.91a3884.dev5"
upgini/dataset.py CHANGED
@@ -48,17 +48,9 @@ except Exception:
48
48
  )
49
49
 
50
50
 
51
- class Dataset: # (pd.DataFrame):
51
+ class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
53
  MAX_ROWS = 200_000
54
- FIT_SAMPLE_ROWS = 200_000
55
- FIT_SAMPLE_THRESHOLD = 200_000
56
- FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
57
- FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
58
- FIT_SAMPLE_THRESHOLD_TS = 54_000
59
- FIT_SAMPLE_ROWS_TS = 54_000
60
- BINARY_MIN_SAMPLE_THRESHOLD = 5_000
61
- MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
62
54
  IMBALANCE_THESHOLD = 0.6
63
55
  BINARY_BOOTSTRAP_LOOPS = 5
64
56
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -576,8 +568,8 @@ class Dataset: # (pd.DataFrame):
576
568
  def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
577
569
  if runtime_parameters is not None and runtime_parameters.properties is not None:
578
570
  if self.cv_type is not None and self.cv_type.is_time_series():
579
- runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
580
- runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
571
+ runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
572
+ runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
581
573
  return runtime_parameters
582
574
 
583
575
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
upgini/features_enricher.py CHANGED
@@ -302,7 +302,7 @@ class FeaturesEnricher(TransformerMixin):
302
302
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
303
303
  self.metrics: Optional[pd.DataFrame] = None
304
304
  self.feature_names_ = []
305
- self.zero_shap_client_features = []
305
+ self.external_source_feature_names = []
306
306
  self.feature_importances_ = []
307
307
  self.search_id = search_id
308
308
  self.disable_force_downsampling = disable_force_downsampling
@@ -981,7 +981,7 @@ class FeaturesEnricher(TransformerMixin):
981
981
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
982
982
  estimator, validated_X, self.search_keys
983
983
  )
984
- if self.id_columns_encoder is not None:
984
+ if self.id_columns and self.id_columns_encoder is not None:
985
985
  if cat_features_from_backend:
986
986
  cat_features_from_backend = [
987
987
  c
@@ -2359,7 +2359,9 @@ if response.status_code == 200:
2359
2359
 
2360
2360
  is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
2361
2361
 
2362
- columns_to_drop = [c for c in df.columns if c in self.feature_names_ and c not in (self.id_columns or [])]
2362
+ columns_to_drop = [
2363
+ c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
2364
+ ]
2363
2365
  if len(columns_to_drop) > 0:
2364
2366
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2365
2367
  self.logger.warning(msg)
@@ -2667,7 +2669,7 @@ if response.status_code == 200:
2667
2669
  selecting_columns = [
2668
2670
  c
2669
2671
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2670
- if c not in self.zero_shap_client_features or c in (self.id_columns or [])
2672
+ if c not in self.external_source_feature_names or c in (self.id_columns or [])
2671
2673
  ]
2672
2674
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2673
2675
  if add_fit_system_record_id:
@@ -4061,7 +4063,7 @@ if response.status_code == 200:
4061
4063
  df = df.rename(columns=original_names_dict)
4062
4064
 
4063
4065
  self.feature_names_ = []
4064
- self.zero_shap_client_features = []
4066
+ self.external_source_feature_names = []
4065
4067
  self.feature_importances_ = []
4066
4068
  features_info = []
4067
4069
  features_info_without_links = []
@@ -4092,10 +4094,11 @@ if response.status_code == 200:
4092
4094
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4093
4095
  is_client_feature = original_name in df.columns
4094
4096
 
4097
+ if not is_client_feature:
4098
+ self.external_source_feature_names.append(original_name)
4099
+
4095
4100
  # TODO make a decision about selected features based on special flag from mlb
4096
4101
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4097
- if is_client_feature and self.fit_select_features:
4098
- self.zero_shap_client_features.append(original_name)
4099
4102
  continue
4100
4103
 
4101
4104
  # Use only important features
upgini/utils/sample_utils.py CHANGED
@@ -15,7 +15,7 @@ TS_MIN_DIFFERENT_IDS_RATIO = 0.2
15
15
  TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
16
16
  TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
17
17
  TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
18
- FIT_SAMPLE_ROWS_TS = 54_000
18
+ FIT_SAMPLE_ROWS_TS = 100_000
19
19
 
20
20
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
21
21
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
upgini-1.2.91a3884.dev5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev4
3
+ Version: 1.2.91a3884.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.91a3884.dev5.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=Zce8QcjCDT4pusClVC02UASiQuwoBtB7rmnk5bHiPIg,33
1
+ upgini/__about__.py,sha256=tqdF5EG5u2XotiGWCSnsQ61GODo019Lr-s097ItdAHs,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=c0rZ-ydrnCdrTzx10WZl4WbO3LdyuF0fUCRD8Ugjitg,33093
4
+ upgini/dataset.py,sha256=MituLJZTDdIwSk-Ia5G1pS52PERNHQ2P99FgCH2kTjQ,32790
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=omgw5OyM4OhamfRKymd84bBF8lh3kiDurpOfSBTdb_4,218184
6
+ upgini/features_enricher.py,sha256=udQdXpqVO4YAwEHjUzS195k2jxe5_CtZ-KTpWRicjfs,218225
7
7
  upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
8
8
  upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
9
9
  upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sample_utils.py,sha256=GMlffkOXQrcnJpTPXDOOPIlH82j4-fgo4D9mBSvJvEM,15350
67
+ upgini/utils/sample_utils.py,sha256=jQ1em2FRnMFLul9ujuBgs5XZ9jAZ4eM4FHT3aDSjOy8,15351
68
68
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
69
69
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
70
70
  upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.91a3884.dev4.dist-info/METADATA,sha256=7vCViUfOCfZCgQ_BFnWgWAcJ0oi4WtaVBij67MNXrK0,49546
75
- upgini-1.2.91a3884.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.91a3884.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.91a3884.dev4.dist-info/RECORD,,
74
+ upgini-1.2.91a3884.dev5.dist-info/METADATA,sha256=A9MDv_VCFrWDDw2Xyo7Wsx3ps6ECwTul36gmw-wujgI,49546
75
+ upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
+ upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.91a3884.dev5.dist-info/RECORD,,