upgini 1.2.91a3884.dev1__py3-none-any.whl → 1.2.91a3884.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/sample_utils.py +75 -13
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev2.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev2.dist-info}/RECORD +7 -7
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev2.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.91a3884.dev1"
|
1
|
+
__version__ = "1.2.91a3884.dev2"
|
@@ -196,6 +196,7 @@ timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) r
|
|
196
196
|
timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
|
197
197
|
timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
|
198
198
|
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
|
199
|
+
missing_ids_in_eval_set=Following ids are present in eval set but not in sampled train set: {}. They will be removed from eval set.
|
199
200
|
# Upload ads validation
|
200
201
|
ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
|
201
202
|
ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
|
upgini/utils/sample_utils.py
CHANGED
@@ -92,12 +92,13 @@ def sample(
|
|
92
92
|
fit_sample_rows = sample_config.fit_sample_rows
|
93
93
|
|
94
94
|
if cv_type is not None and cv_type.is_time_series():
|
95
|
-
return
|
95
|
+
return sample_time_series_train_eval(
|
96
96
|
df,
|
97
|
-
sample_columns
|
98
|
-
sample_columns.date,
|
97
|
+
sample_columns,
|
99
98
|
sample_config.fit_sample_rows_ts,
|
100
|
-
|
99
|
+
trim_threshold=fit_sample_threshold,
|
100
|
+
max_rows=fit_sample_rows,
|
101
|
+
random_state=random_state,
|
101
102
|
logger=logger,
|
102
103
|
**kwargs,
|
103
104
|
)
|
@@ -128,6 +129,68 @@ def sample(
|
|
128
129
|
return df
|
129
130
|
|
130
131
|
|
132
|
+
def sample_time_series_train_eval(
    df: pd.DataFrame,
    sample_columns: SampleColumns,
    sample_size: int,
    trim_threshold: int,
    max_rows: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    **kwargs,
) -> pd.DataFrame:
    """Sample a time series dataset that may contain both train and eval rows.

    Rows are split by ``sample_columns.eval_set_index`` when that column is
    present (``== 0`` is train, ``> 0`` is eval); otherwise the whole frame is
    treated as train. The train part is always passed through
    ``sample_time_series_trunc`` with ``sample_size``.

    When id columns are configured and an eval part exists, eval rows whose id
    combination does not occur in the sampled train part are reported to the
    user and dropped.

    Args:
        df: Input dataset.
        sample_columns: Names of the id, date and eval-set-index columns.
        sample_size: Row budget passed to ``sample_time_series_trunc`` for the
            train part.
        trim_threshold: The eval part is truncated only when train + eval rows
            exceed this number.
        max_rows: Total row budget; the eval part is truncated to
            ``max_rows - len(train)``, and a train-only dataset to ``max_rows``.
        random_state: Forwarded to ``sample_time_series_trunc``.
        logger: Forwarded to ``sample_time_series_trunc``.
        bundle: Resource bundle holding the ``missing_ids_in_eval_set`` message;
            ``get_custom_bundle()`` is used when omitted.
        **kwargs: Forwarded to ``sample_time_series_trunc``.

    Returns:
        The sampled train rows, followed by the (possibly filtered and
        truncated) eval rows when an eval part exists.
    """
    if sample_columns.eval_set_index in df.columns:
        train_df = df[df[sample_columns.eval_set_index] == 0]
        eval_df = df[df[sample_columns.eval_set_index] > 0]
    else:
        train_df = df
        eval_df = None

    train_df = sample_time_series_trunc(
        train_df, sample_columns.ids, sample_columns.date, sample_size, random_state, logger=logger, **kwargs
    )

    if sample_columns.ids and eval_df is not None:
        id_columns = sample_columns.ids
        # Row-wise membership test: an eval row is "present" only when its full
        # id combination occurs in the sampled train set. A cell-wise
        # ``isin(np.unique(...))`` would compare every id cell against the
        # flattened values of all id columns, which disagrees with the row-wise
        # removal for multi-column ids and can fail on mixed-type id columns.
        train_ids = train_df[id_columns].drop_duplicates()
        # The right side has unique keys, so the left merge yields exactly one
        # output row per eval row, in eval order; the mask is therefore
        # positionally aligned with ``eval_df``.
        matched = eval_df[id_columns].merge(train_ids, on=id_columns, how="left", indicator=True)
        present = (matched["_merge"] == "both").to_numpy()

        # As before, ids containing NaN are not reported, and filtering happens
        # only when there is something to report.
        missing_ids = eval_df.loc[~present, id_columns].dropna().drop_duplicates().values.tolist()
        if missing_ids:
            bundle = bundle or get_custom_bundle()
            print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
            # A boolean mask keeps the original index labels, so the concat
            # below does not mix a fresh RangeIndex with the train index.
            eval_df = eval_df[present]

    if eval_df is not None:
        if len(eval_df) > trim_threshold - len(train_df):
            # NOTE(review): ``max_rows - len(train_df)`` is not guarded against
            # being <= 0 here; behaviour then depends on how
            # ``sample_time_series_trunc`` treats a non-positive size - confirm.
            eval_df = sample_time_series_trunc(
                eval_df,
                sample_columns.ids,
                sample_columns.date,
                max_rows - len(train_df),
                random_state,
                logger=logger,
                **kwargs,
            )
        df = pd.concat([train_df, eval_df])

    elif len(train_df) > max_rows:
        df = sample_time_series_trunc(
            train_df,
            sample_columns.ids,
            sample_columns.date,
            max_rows,
            random_state,
            logger=logger,
            **kwargs,
        )
    else:
        df = train_df

    return df
|
192
|
+
|
193
|
+
|
131
194
|
def sample_time_series_trunc(
|
132
195
|
df: pd.DataFrame,
|
133
196
|
id_columns: Optional[List[str]],
|
@@ -242,9 +305,7 @@ def sample_time_series(
|
|
242
305
|
|
243
306
|
def balance_undersample_forced(
|
244
307
|
df: pd.DataFrame,
|
245
|
-
|
246
|
-
id_columns: Optional[List[str]],
|
247
|
-
date_column: str,
|
308
|
+
sample_columns: SampleColumns,
|
248
309
|
task_type: ModelTaskType,
|
249
310
|
cv_type: Optional[CVType],
|
250
311
|
random_state: int,
|
@@ -268,7 +329,7 @@ def balance_undersample_forced(
|
|
268
329
|
if warning_callback is not None:
|
269
330
|
warning_callback(msg)
|
270
331
|
|
271
|
-
target = df[
|
332
|
+
target = df[sample_columns.target].copy()
|
272
333
|
|
273
334
|
vc = target.value_counts()
|
274
335
|
max_class_value = vc.index[0]
|
@@ -280,11 +341,12 @@ def balance_undersample_forced(
|
|
280
341
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
281
342
|
if cv_type is not None and cv_type.is_time_series():
|
282
343
|
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
283
|
-
resampled_data =
|
344
|
+
resampled_data = sample_time_series_train_eval(
|
284
345
|
df,
|
285
|
-
|
286
|
-
date_column=date_column,
|
346
|
+
sample_columns=sample_columns,
|
287
347
|
sample_size=sample_size,
|
348
|
+
trim_threshold=sample_size,
|
349
|
+
max_rows=sample_size,
|
288
350
|
random_state=random_state,
|
289
351
|
logger=logger,
|
290
352
|
)
|
@@ -296,8 +358,8 @@ def balance_undersample_forced(
|
|
296
358
|
logger.warning(msg)
|
297
359
|
|
298
360
|
# fill up to min_sample_threshold by majority class
|
299
|
-
minority_class = df[df[
|
300
|
-
majority_class = df[df[
|
361
|
+
minority_class = df[df[sample_columns.target] == min_class_value]
|
362
|
+
majority_class = df[df[sample_columns.target] != min_class_value]
|
301
363
|
logger.info(
|
302
364
|
f"Min class count: {min_class_count}. Max class count: {max_class_count}."
|
303
365
|
f" Rebalance sample size: {sample_size}"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.91a3884.dev1
|
3
|
+
Version: 1.2.91a3884.dev2
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,4 +1,4 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=RG6ueZRQs3BUHBh7ch4HmvbtSSP9jbOsaWiZUilfT0s,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=c0rZ-ydrnCdrTzx10WZl4WbO3LdyuF0fUCRD8Ugjitg,33093
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=Hfpr2-I5Ws6ugIN1QSz549OHayZeLYglRsbrGDT6g9g,28491
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
|
-
upgini/utils/sample_utils.py,sha256=
|
67
|
+
upgini/utils/sample_utils.py,sha256=wjxVEm5owjoJLUU-Re1gNwj81QLhtSWSGxvp2GVXRxg,15235
|
68
68
|
upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
|
69
69
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
70
70
|
upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.91a3884.
|
75
|
-
upgini-1.2.91a3884.
|
76
|
-
upgini-1.2.91a3884.
|
77
|
-
upgini-1.2.91a3884.
|
74
|
+
upgini-1.2.91a3884.dev2.dist-info/METADATA,sha256=mR63uBPU2-kA3MNagei7Uxxn0uO0YeTm9t5eU1y942A,49546
|
75
|
+
upgini-1.2.91a3884.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.91a3884.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.91a3884.dev2.dist-info/RECORD,,
|
File without changes
|
File without changes
|