upgini 1.2.91a3884.dev1__py3-none-any.whl → 1.2.91a3884.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/sample_utils.py +76 -13
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/RECORD +7 -7
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.91a3884.dev1"
+__version__ = "1.2.91a3884.dev3"
upgini/resource_bundle/strings.properties
CHANGED
@@ -196,6 +196,7 @@ timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) r
 timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
 timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
 date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
+missing_ids_in_eval_set=Following ids are present in eval set but not in sampled train set: {}. They will be removed from eval set.
 # Upload ads validation
 ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
 ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
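The new missing_ids_in_eval_set key is an ordinary str.format template; it is consumed in upgini/utils/sample_utils.py below via bundle.get("missing_ids_in_eval_set").format(missing_ids). A minimal sketch of the rendered warning, using invented ids and a literal string in place of the ResourceBundle lookup:

# Illustration only: the id values are invented, and upgini resolves the
# template through its ResourceBundle rather than a hard-coded string.
template = (
    "Following ids are present in eval set but not in sampled train set: {}. "
    "They will be removed from eval set."
)
missing_ids = ["store_17", "store_42"]  # hypothetical ids
print(template.format(missing_ids))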
upgini/utils/sample_utils.py
CHANGED
@@ -92,12 +92,13 @@ def sample(
     fit_sample_rows = sample_config.fit_sample_rows
 
     if cv_type is not None and cv_type.is_time_series():
-        return
+        return sample_time_series_train_eval(
             df,
-            sample_columns
-            sample_columns.date,
+            sample_columns,
             sample_config.fit_sample_rows_ts,
-
+            trim_threshold=fit_sample_threshold,
+            max_rows=fit_sample_rows,
+            random_state=random_state,
             logger=logger,
             **kwargs,
         )
@@ -128,6 +129,68 @@ def sample(
     return df
 
 
+def sample_time_series_train_eval(
+    df: pd.DataFrame,
+    sample_columns: SampleColumns,
+    sample_size: int,
+    trim_threshold: int,
+    max_rows: int,
+    random_state: int = 42,
+    logger: Optional[logging.Logger] = None,
+    bundle: Optional[ResourceBundle] = None,
+    **kwargs,
+):
+    if sample_columns.eval_set_index in df.columns:
+        train_df = df[df[sample_columns.eval_set_index] == 0]
+        eval_df = df[df[sample_columns.eval_set_index] > 0]
+    else:
+        train_df = df
+        eval_df = None
+
+    train_df = sample_time_series_trunc(
+        train_df, sample_columns.ids, sample_columns.date, sample_size, random_state, logger=logger, **kwargs
+    )
+    if sample_columns.ids and eval_df is not None:
+        missing_ids = (
+            eval_df[~eval_df[sample_columns.ids].isin(np.unique(train_df[sample_columns.ids]))][sample_columns.ids]
+            .dropna()
+            .drop_duplicates()
+            .values.tolist()
+        )
+        if missing_ids:
+            bundle = bundle or get_custom_bundle()
+            print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
+            eval_df = eval_df.merge(train_df[sample_columns.ids].drop_duplicates())
+
+    if eval_df is not None:
+        if len(eval_df) > trim_threshold - len(train_df):
+            eval_df = sample_time_series_trunc(
+                eval_df,
+                sample_columns.ids,
+                sample_columns.date,
+                max_rows - len(train_df),
+                random_state,
+                logger=logger,
+                **kwargs,
+            )
+        df = pd.concat([train_df, eval_df])
+
+    elif len(train_df) > max_rows:
+        df = sample_time_series_trunc(
+            train_df,
+            sample_columns.ids,
+            sample_columns.date,
+            max_rows,
+            random_state,
+            logger=logger,
+            **kwargs,
+        )
+    else:
+        df = train_df
+
+    return df
+
+
 def sample_time_series_trunc(
     df: pd.DataFrame,
     id_columns: Optional[List[str]],
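The new sample_time_series_train_eval helper splits the frame into train and eval parts by eval_set_index, truncates the train part with sample_time_series_trunc, drops eval ids that no longer appear in the sampled train part, and only then trims the eval part against trim_threshold and max_rows. A rough usage sketch follows; it assumes both sample_time_series_train_eval and SampleColumns are importable from upgini.utils.sample_utils and that SampleColumns accepts the field names used in this diff (date, ids, target, eval_set_index), so treat it as a sketch rather than documented API.

# Hedged usage sketch, not documented API: the import path and the
# SampleColumns constructor arguments are inferred from this diff.
import numpy as np
import pandas as pd

from upgini.utils.sample_utils import SampleColumns, sample_time_series_train_eval

# Two series ("a" and "b") over eight days; the last four rows of "b" act as
# the eval set (eval_set_index > 0).
df = pd.DataFrame(
    {
        "date": list(pd.date_range("2024-01-01", periods=8, freq="D")) * 2,
        "store_id": ["a"] * 8 + ["b"] * 8,
        "target": np.arange(16),
        "eval_set_index": [0] * 12 + [1] * 4,
    }
)

columns = SampleColumns(
    date="date", ids=["store_id"], target="target", eval_set_index="eval_set_index"
)

# trim_threshold and max_rows bound the combined size of train + eval,
# mirroring the new call site in sample().
sampled = sample_time_series_train_eval(
    df,
    sample_columns=columns,
    sample_size=8,
    trim_threshold=12,
    max_rows=12,
)
print(len(sampled))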
@@ -189,6 +252,7 @@ def sample_time_series(
     min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
     prefer_recent_dates: bool = True,
     logger: Optional[logging.Logger] = None,
+    **kwargs,
 ):
     def ensure_tuple(x):
         return tuple([x]) if not isinstance(x, tuple) else x
@@ -242,9 +306,7 @@ def sample_time_series(
 
 def balance_undersample_forced(
     df: pd.DataFrame,
-
-    id_columns: Optional[List[str]],
-    date_column: str,
+    sample_columns: SampleColumns,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
     random_state: int,
@@ -268,7 +330,7 @@ def balance_undersample_forced(
     if warning_callback is not None:
         warning_callback(msg)
 
-    target = df[
+    target = df[sample_columns.target].copy()
 
     vc = target.value_counts()
     max_class_value = vc.index[0]
@@ -280,11 +342,12 @@ def balance_undersample_forced(
     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
     if cv_type is not None and cv_type.is_time_series():
         logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
-        resampled_data =
+        resampled_data = sample_time_series_train_eval(
             df,
-
-            date_column=date_column,
+            sample_columns=sample_columns,
             sample_size=sample_size,
+            trim_threshold=sample_size,
+            max_rows=sample_size,
             random_state=random_state,
             logger=logger,
         )
@@ -296,8 +359,8 @@ def balance_undersample_forced(
         logger.warning(msg)
 
         # fill up to min_sample_threshold by majority class
-        minority_class = df[df[
-        majority_class = df[df[
+        minority_class = df[df[sample_columns.target] == min_class_value]
+        majority_class = df[df[sample_columns.target] != min_class_value]
        logger.info(
            f"Min class count: {min_class_count}. Max class count: {max_class_count}."
            f" Rebalance sample size: {sample_size}"
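The rest of the classification branch is not shown in this hunk; as a generic illustration of the pattern the comment describes (keep every minority-class row, then top the sample up with majority-class rows), here is a small pandas sketch. It is not upgini's exact implementation; only the variable names mirror the hunk above.

# Generic rebalancing sketch, not upgini's code: keep all minority rows and
# sample just enough majority rows to reach the requested total.
import pandas as pd


def fill_with_majority(
    minority_class: pd.DataFrame,
    majority_class: pd.DataFrame,
    sample_size: int,
    random_state: int = 42,
) -> pd.DataFrame:
    n_majority = min(len(majority_class), max(sample_size - len(minority_class), 0))
    sampled_majority = majority_class.sample(n=n_majority, random_state=random_state)
    return pd.concat([minority_class, sampled_majority]).sort_index()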
{upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.91a3884.dev1
+Version: 1.2.91a3884.dev3
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=55Sg-JLu4aw-5ANNPanS_ciHPSsxXTa8YndbgltGREA,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=c0rZ-ydrnCdrTzx10WZl4WbO3LdyuF0fUCRD8Ugjitg,33093
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=Hfpr2-I5Ws6ugIN1QSz549OHayZeLYglRsbrGDT6g9g,28491
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sample_utils.py,sha256=
+upgini/utils/sample_utils.py,sha256=PpMXRVTPKi6TyAo0gPhF0OmXmutecHdonM7WYUsB1Wo,15249
 upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
 upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.91a3884.
-upgini-1.2.91a3884.
-upgini-1.2.91a3884.
-upgini-1.2.91a3884.
+upgini-1.2.91a3884.dev3.dist-info/METADATA,sha256=teoc8dCmv4mb2eBV6QruZag3xnwK3YAdlKCHuIKllXw,49546
+upgini-1.2.91a3884.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.91a3884.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.91a3884.dev3.dist-info/RECORD,,
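The RECORD entries change whenever file contents change: per the wheel RECORD format, each line is the file path, a urlsafe base64 SHA-256 digest with the trailing "=" padding stripped, and the file size in bytes. A short sketch for reproducing an entry locally (the path is just an example):

# Recompute a RECORD-style "path,sha256=<digest>,<size>" entry for a file from
# an unpacked wheel or installed package.
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


print(record_entry("upgini/__about__.py"))  # e.g. upgini/__about__.py,sha256=...,33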
{upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/WHEEL
File without changes

{upgini-1.2.91a3884.dev1.dist-info → upgini-1.2.91a3884.dev3.dist-info}/licenses/LICENSE
File without changes