upgini 1.2.91a3884.dev3__py3-none-any.whl → 1.2.91a3884.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +3 -11
- upgini/features_enricher.py +22 -6
- upgini/metrics.py +1 -1
- upgini/utils/sample_utils.py +4 -1
- {upgini-1.2.91a3884.dev3.dist-info → upgini-1.2.91a3884.dev5.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev3.dist-info → upgini-1.2.91a3884.dev5.dist-info}/RECORD +9 -9
- {upgini-1.2.91a3884.dev3.dist-info → upgini-1.2.91a3884.dev5.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev3.dist-info → upgini-1.2.91a3884.dev5.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.91a3884.dev3"
|
1
|
+
__version__ = "1.2.91a3884.dev5"
|
upgini/dataset.py
CHANGED
@@ -48,17 +48,9 @@ except Exception:
|
|
48
48
|
)
|
49
49
|
|
50
50
|
|
51
|
-
class Dataset:
|
51
|
+
class Dataset:
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
53
|
MAX_ROWS = 200_000
|
54
|
-
FIT_SAMPLE_ROWS = 200_000
|
55
|
-
FIT_SAMPLE_THRESHOLD = 200_000
|
56
|
-
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
57
|
-
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
58
|
-
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
59
|
-
FIT_SAMPLE_ROWS_TS = 54_000
|
60
|
-
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
61
|
-
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
62
54
|
IMBALANCE_THESHOLD = 0.6
|
63
55
|
BINARY_BOOTSTRAP_LOOPS = 5
|
64
56
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
@@ -576,8 +568,8 @@ class Dataset: # (pd.DataFrame):
|
|
576
568
|
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
577
569
|
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
578
570
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
579
|
-
runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
580
|
-
runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
571
|
+
runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
|
572
|
+
runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
|
581
573
|
return runtime_parameters
|
582
574
|
|
583
575
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
upgini/features_enricher.py
CHANGED
@@ -302,7 +302,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
302
302
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
303
303
|
self.metrics: Optional[pd.DataFrame] = None
|
304
304
|
self.feature_names_ = []
|
305
|
-
self.zero_shap_client_features = []
|
305
|
+
self.external_source_feature_names = []
|
306
306
|
self.feature_importances_ = []
|
307
307
|
self.search_id = search_id
|
308
308
|
self.disable_force_downsampling = disable_force_downsampling
|
@@ -981,6 +981,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
981
981
|
client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
|
982
982
|
estimator, validated_X, self.search_keys
|
983
983
|
)
|
984
|
+
if self.id_columns and self.id_columns_encoder is not None:
|
985
|
+
if cat_features_from_backend:
|
986
|
+
cat_features_from_backend = [
|
987
|
+
c
|
988
|
+
for c in cat_features_from_backend
|
989
|
+
if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
|
990
|
+
]
|
991
|
+
if client_cat_features:
|
992
|
+
client_cat_features = [
|
993
|
+
c
|
994
|
+
for c in client_cat_features
|
995
|
+
if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
|
996
|
+
]
|
984
997
|
for cat_feature in cat_features_from_backend:
|
985
998
|
original_cat_feature = self.fit_columns_renaming.get(cat_feature)
|
986
999
|
if original_cat_feature in self.search_keys:
|
@@ -2346,7 +2359,9 @@ if response.status_code == 200:
|
|
2346
2359
|
|
2347
2360
|
is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
|
2348
2361
|
|
2349
|
-
columns_to_drop = [
|
2362
|
+
columns_to_drop = [
|
2363
|
+
c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
|
2364
|
+
]
|
2350
2365
|
if len(columns_to_drop) > 0:
|
2351
2366
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
2352
2367
|
self.logger.warning(msg)
|
@@ -2654,7 +2669,7 @@ if response.status_code == 200:
|
|
2654
2669
|
selecting_columns = [
|
2655
2670
|
c
|
2656
2671
|
for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
|
2657
|
-
if c not in self.zero_shap_client_features
|
2672
|
+
if c not in self.external_source_feature_names or c in (self.id_columns or [])
|
2658
2673
|
]
|
2659
2674
|
selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
|
2660
2675
|
if add_fit_system_record_id:
|
@@ -4048,7 +4063,7 @@ if response.status_code == 200:
|
|
4048
4063
|
df = df.rename(columns=original_names_dict)
|
4049
4064
|
|
4050
4065
|
self.feature_names_ = []
|
4051
|
-
self.zero_shap_client_features = []
|
4066
|
+
self.external_source_feature_names = []
|
4052
4067
|
self.feature_importances_ = []
|
4053
4068
|
features_info = []
|
4054
4069
|
features_info_without_links = []
|
@@ -4079,10 +4094,11 @@ if response.status_code == 200:
|
|
4079
4094
|
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4080
4095
|
is_client_feature = original_name in df.columns
|
4081
4096
|
|
4097
|
+
if not is_client_feature:
|
4098
|
+
self.external_source_feature_names.append(original_name)
|
4099
|
+
|
4082
4100
|
# TODO make a decision about selected features based on special flag from mlb
|
4083
4101
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4084
|
-
if is_client_feature and self.fit_select_features:
|
4085
|
-
self.zero_shap_client_features.append(original_name)
|
4086
4102
|
continue
|
4087
4103
|
|
4088
4104
|
# Use only important features
|
upgini/metrics.py
CHANGED
@@ -815,7 +815,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
815
815
|
else:
|
816
816
|
encoded = cat_encoder.transform(x[self.cat_features])
|
817
817
|
cat_features = encoded.columns.to_list()
|
818
|
-
x[self.cat_features] = encoded
|
818
|
+
x.loc[:, self.cat_features] = encoded
|
819
819
|
else:
|
820
820
|
cat_features = self.cat_features
|
821
821
|
|
upgini/utils/sample_utils.py
CHANGED
@@ -15,7 +15,7 @@ TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
15
|
TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
|
16
16
|
TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
|
17
17
|
TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
18
|
-
FIT_SAMPLE_ROWS_TS = 54_000
|
18
|
+
FIT_SAMPLE_ROWS_TS = 100_000
|
19
19
|
|
20
20
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
21
21
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
@@ -173,6 +173,7 @@ def sample_time_series_train_eval(
|
|
173
173
|
logger=logger,
|
174
174
|
**kwargs,
|
175
175
|
)
|
176
|
+
logger.info(f"Eval set size: {len(eval_df)}")
|
176
177
|
df = pd.concat([train_df, eval_df])
|
177
178
|
|
178
179
|
elif len(train_df) > max_rows:
|
@@ -188,6 +189,8 @@ def sample_time_series_train_eval(
|
|
188
189
|
else:
|
189
190
|
df = train_df
|
190
191
|
|
192
|
+
logger.info(f"Train set size: {len(df)}")
|
193
|
+
|
191
194
|
return df
|
192
195
|
|
193
196
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.91a3884.dev3
|
3
|
+
Version: 1.2.91a3884.dev5
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=tqdF5EG5u2XotiGWCSnsQ61GODo019Lr-s097ItdAHs,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=MituLJZTDdIwSk-Ia5G1pS52PERNHQ2P99FgCH2kTjQ,32790
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=udQdXpqVO4YAwEHjUzS195k2jxe5_CtZ-KTpWRicjfs,218225
|
7
7
|
upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
|
-
upgini/utils/sample_utils.py,sha256=
|
67
|
+
upgini/utils/sample_utils.py,sha256=jQ1em2FRnMFLul9ujuBgs5XZ9jAZ4eM4FHT3aDSjOy8,15351
|
68
68
|
upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
|
69
69
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
70
70
|
upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.91a3884.
|
75
|
-
upgini-1.2.91a3884.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
-
upgini-1.2.91a3884.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
-
upgini-1.2.91a3884.dev3.dist-info/RECORD,,
|
74
|
+
upgini-1.2.91a3884.dev5.dist-info/METADATA,sha256=A9MDv_VCFrWDDw2Xyo7Wsx3ps6ECwTul36gmw-wujgI,49546
|
75
|
+
upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.91a3884.dev5.dist-info/RECORD,,
|
File without changes
|
File without changes
|