upgini 1.2.91a3884.dev4__py3-none-any.whl → 1.2.91a3884.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +3 -11
- upgini/features_enricher.py +10 -7
- upgini/utils/sample_utils.py +1 -1
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3884.dev5.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3884.dev5.dist-info}/RECORD +8 -8
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3884.dev5.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev4.dist-info → upgini-1.2.91a3884.dev5.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.91a3884.dev4"
|
1
|
+
__version__ = "1.2.91a3884.dev5"
|
upgini/dataset.py
CHANGED
@@ -48,17 +48,9 @@ except Exception:
|
|
48
48
|
)
|
49
49
|
|
50
50
|
|
51
|
-
class Dataset:
|
51
|
+
class Dataset:
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
53
|
MAX_ROWS = 200_000
|
54
|
-
FIT_SAMPLE_ROWS = 200_000
|
55
|
-
FIT_SAMPLE_THRESHOLD = 200_000
|
56
|
-
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
57
|
-
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
58
|
-
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
59
|
-
FIT_SAMPLE_ROWS_TS = 54_000
|
60
|
-
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
61
|
-
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
62
54
|
IMBALANCE_THESHOLD = 0.6
|
63
55
|
BINARY_BOOTSTRAP_LOOPS = 5
|
64
56
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
@@ -576,8 +568,8 @@ class Dataset: # (pd.DataFrame):
|
|
576
568
|
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
577
569
|
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
578
570
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
579
|
-
runtime_parameters.properties["sample_size"] = self.
|
580
|
-
runtime_parameters.properties["iter0_sample_size"] = self.
|
571
|
+
runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
|
572
|
+
runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
|
581
573
|
return runtime_parameters
|
582
574
|
|
583
575
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
upgini/features_enricher.py
CHANGED
@@ -302,7 +302,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
302
302
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
303
303
|
self.metrics: Optional[pd.DataFrame] = None
|
304
304
|
self.feature_names_ = []
|
305
|
-
self.
|
305
|
+
self.external_source_feature_names = []
|
306
306
|
self.feature_importances_ = []
|
307
307
|
self.search_id = search_id
|
308
308
|
self.disable_force_downsampling = disable_force_downsampling
|
@@ -981,7 +981,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
981
981
|
client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
|
982
982
|
estimator, validated_X, self.search_keys
|
983
983
|
)
|
984
|
-
if self.id_columns_encoder is not None:
|
984
|
+
if self.id_columns and self.id_columns_encoder is not None:
|
985
985
|
if cat_features_from_backend:
|
986
986
|
cat_features_from_backend = [
|
987
987
|
c
|
@@ -2359,7 +2359,9 @@ if response.status_code == 200:
|
|
2359
2359
|
|
2360
2360
|
is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
|
2361
2361
|
|
2362
|
-
columns_to_drop = [
|
2362
|
+
columns_to_drop = [
|
2363
|
+
c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
|
2364
|
+
]
|
2363
2365
|
if len(columns_to_drop) > 0:
|
2364
2366
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
2365
2367
|
self.logger.warning(msg)
|
@@ -2667,7 +2669,7 @@ if response.status_code == 200:
|
|
2667
2669
|
selecting_columns = [
|
2668
2670
|
c
|
2669
2671
|
for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
|
2670
|
-
if c not in self.
|
2672
|
+
if c not in self.external_source_feature_names or c in (self.id_columns or [])
|
2671
2673
|
]
|
2672
2674
|
selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
|
2673
2675
|
if add_fit_system_record_id:
|
@@ -4061,7 +4063,7 @@ if response.status_code == 200:
|
|
4061
4063
|
df = df.rename(columns=original_names_dict)
|
4062
4064
|
|
4063
4065
|
self.feature_names_ = []
|
4064
|
-
self.
|
4066
|
+
self.external_source_feature_names = []
|
4065
4067
|
self.feature_importances_ = []
|
4066
4068
|
features_info = []
|
4067
4069
|
features_info_without_links = []
|
@@ -4092,10 +4094,11 @@ if response.status_code == 200:
|
|
4092
4094
|
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4093
4095
|
is_client_feature = original_name in df.columns
|
4094
4096
|
|
4097
|
+
if not is_client_feature:
|
4098
|
+
self.external_source_feature_names.append(original_name)
|
4099
|
+
|
4095
4100
|
# TODO make a decision about selected features based on special flag from mlb
|
4096
4101
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4097
|
-
if is_client_feature and self.fit_select_features:
|
4098
|
-
self.zero_shap_client_features.append(original_name)
|
4099
4102
|
continue
|
4100
4103
|
|
4101
4104
|
# Use only important features
|
upgini/utils/sample_utils.py
CHANGED
@@ -15,7 +15,7 @@ TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
15
|
TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
|
16
16
|
TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
|
17
17
|
TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
18
|
-
FIT_SAMPLE_ROWS_TS =
|
18
|
+
FIT_SAMPLE_ROWS_TS = 100_000
|
19
19
|
|
20
20
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
21
21
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.91a3884.dev4
|
3
|
+
Version: 1.2.91a3884.dev5
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=tqdF5EG5u2XotiGWCSnsQ61GODo019Lr-s097ItdAHs,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=MituLJZTDdIwSk-Ia5G1pS52PERNHQ2P99FgCH2kTjQ,32790
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=udQdXpqVO4YAwEHjUzS195k2jxe5_CtZ-KTpWRicjfs,218225
|
7
7
|
upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
9
|
upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
|
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
|
-
upgini/utils/sample_utils.py,sha256=
|
67
|
+
upgini/utils/sample_utils.py,sha256=jQ1em2FRnMFLul9ujuBgs5XZ9jAZ4eM4FHT3aDSjOy8,15351
|
68
68
|
upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
|
69
69
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
70
70
|
upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.91a3884.
|
75
|
-
upgini-1.2.91a3884.
|
76
|
-
upgini-1.2.91a3884.
|
77
|
-
upgini-1.2.91a3884.
|
74
|
+
upgini-1.2.91a3884.dev5.dist-info/METADATA,sha256=A9MDv_VCFrWDDw2Xyo7Wsx3ps6ECwTul36gmw-wujgI,49546
|
75
|
+
upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.91a3884.dev5.dist-info/RECORD,,
|
File without changes
|
File without changes
|