upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/dataset.py
CHANGED
|
@@ -1,30 +1,25 @@
|
|
|
1
1
|
import csv
|
|
2
|
-
import hashlib
|
|
3
2
|
import logging
|
|
4
3
|
import tempfile
|
|
5
4
|
import time
|
|
6
|
-
from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
|
|
7
5
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
9
7
|
|
|
10
8
|
import numpy as np
|
|
11
9
|
import pandas as pd
|
|
12
|
-
from pandas.api.types import is_bool_dtype as is_bool
|
|
13
|
-
from pandas.api.types import is_datetime64_any_dtype as is_datetime
|
|
14
10
|
from pandas.api.types import (
|
|
15
11
|
is_float_dtype,
|
|
16
12
|
is_integer_dtype,
|
|
17
13
|
is_numeric_dtype,
|
|
18
14
|
is_object_dtype,
|
|
19
|
-
is_period_dtype,
|
|
20
15
|
is_string_dtype,
|
|
21
16
|
)
|
|
22
17
|
|
|
23
18
|
from upgini.errors import ValidationError
|
|
24
19
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
20
|
from upgini.metadata import (
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
22
|
EVAL_SET_INDEX,
|
|
27
|
-
SYSTEM_COLUMNS,
|
|
28
23
|
SYSTEM_RECORD_ID,
|
|
29
24
|
TARGET,
|
|
30
25
|
DataType,
|
|
@@ -38,10 +33,8 @@ from upgini.metadata import (
|
|
|
38
33
|
RuntimeParameters,
|
|
39
34
|
SearchCustomization,
|
|
40
35
|
)
|
|
41
|
-
from upgini.normalizer.phone_normalizer import PhoneNormalizer
|
|
42
36
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
43
37
|
from upgini.search_task import SearchTask
|
|
44
|
-
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
45
38
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
46
39
|
from upgini.utils.target_utils import balance_undersample
|
|
47
40
|
|
|
@@ -60,7 +53,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
60
53
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
61
54
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
62
55
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
63
|
-
|
|
56
|
+
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
57
|
+
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
64
58
|
IMBALANCE_THESHOLD = 0.6
|
|
65
59
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
66
60
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
@@ -79,6 +73,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
73
|
path: Optional[str] = None,
|
|
80
74
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
75
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
76
|
+
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
82
77
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
78
|
random_state: Optional[int] = None,
|
|
84
79
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,7 +108,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
108
|
self.description = description
|
|
114
109
|
self.meaning_types = meaning_types
|
|
115
110
|
self.search_keys = search_keys
|
|
116
|
-
self.
|
|
111
|
+
self.unnest_search_keys = unnest_search_keys
|
|
117
112
|
self.hierarchical_group_keys = []
|
|
118
113
|
self.hierarchical_subgroup_keys = []
|
|
119
114
|
self.file_upload_id: Optional[str] = None
|
|
@@ -164,242 +159,13 @@ class Dataset: # (pd.DataFrame):
|
|
|
164
159
|
raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
|
|
165
160
|
|
|
166
161
|
def __validate_max_row_count(self):
|
|
167
|
-
if
|
|
162
|
+
if ENTITY_SYSTEM_RECORD_ID in self.data.columns:
|
|
163
|
+
rows_count = self.data[ENTITY_SYSTEM_RECORD_ID].nunique()
|
|
164
|
+
else:
|
|
165
|
+
rows_count = len(self.data)
|
|
166
|
+
if rows_count > self.MAX_ROWS:
|
|
168
167
|
raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
|
|
169
168
|
|
|
170
|
-
def __rename_columns(self):
|
|
171
|
-
# self.logger.info("Replace restricted symbols in column names")
|
|
172
|
-
new_columns = []
|
|
173
|
-
dup_counter = 0
|
|
174
|
-
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
176
|
-
self.columns_renaming[column] = column
|
|
177
|
-
new_columns.append(column)
|
|
178
|
-
continue
|
|
179
|
-
|
|
180
|
-
new_column = str(column)
|
|
181
|
-
suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
|
|
182
|
-
if len(new_column) == 0:
|
|
183
|
-
raise ValidationError(self.bundle.get("dataset_empty_column_names"))
|
|
184
|
-
# db limit for column length
|
|
185
|
-
if len(new_column) > 250:
|
|
186
|
-
new_column = new_column[:250]
|
|
187
|
-
|
|
188
|
-
# make column name unique relative to server features
|
|
189
|
-
new_column = f"{new_column}_{suffix}"
|
|
190
|
-
|
|
191
|
-
new_column = new_column.lower()
|
|
192
|
-
|
|
193
|
-
# if column starts with non alphabetic symbol then add "a" to the beginning of string
|
|
194
|
-
if ord(new_column[0]) not in range(ord("a"), ord("z") + 1):
|
|
195
|
-
new_column = "a" + new_column
|
|
196
|
-
|
|
197
|
-
# replace unsupported characters to "_"
|
|
198
|
-
for idx, c in enumerate(new_column):
|
|
199
|
-
if ord(c) not in range(ord("a"), ord("z") + 1) and ord(c) not in range(ord("0"), ord("9") + 1):
|
|
200
|
-
new_column = new_column[:idx] + "_" + new_column[idx + 1 :]
|
|
201
|
-
|
|
202
|
-
if new_column in new_columns:
|
|
203
|
-
new_column = f"{new_column}_{dup_counter}"
|
|
204
|
-
dup_counter += 1
|
|
205
|
-
new_columns.append(new_column)
|
|
206
|
-
|
|
207
|
-
# self.data.columns.values[col_idx] = new_column
|
|
208
|
-
# self.rename(columns={column: new_column}, inplace=True)
|
|
209
|
-
self.meaning_types = {
|
|
210
|
-
(new_column if key == str(column) else key): value for key, value in self.meaning_types_checked.items()
|
|
211
|
-
}
|
|
212
|
-
self.search_keys = [
|
|
213
|
-
tuple(new_column if key == str(column) else key for key in keys) for keys in self.search_keys_checked
|
|
214
|
-
]
|
|
215
|
-
self.columns_renaming[new_column] = str(column)
|
|
216
|
-
self.data.columns = new_columns
|
|
217
|
-
self.etalon_def = None
|
|
218
|
-
|
|
219
|
-
def __validate_too_long_string_values(self):
|
|
220
|
-
"""Check that string values less than maximum characters for LLM"""
|
|
221
|
-
# self.logger.info("Validate too long string values")
|
|
222
|
-
for col in self.data.columns:
|
|
223
|
-
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
224
|
-
max_length: int = self.data[col].astype("str").str.len().max()
|
|
225
|
-
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
226
|
-
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
227
|
-
|
|
228
|
-
def __convert_bools(self):
|
|
229
|
-
"""Convert bool columns to string"""
|
|
230
|
-
# self.logger.info("Converting bool to int")
|
|
231
|
-
for col in self.data.columns:
|
|
232
|
-
if is_bool(self.data[col]):
|
|
233
|
-
self.data[col] = self.data[col].astype("str")
|
|
234
|
-
|
|
235
|
-
def __convert_float16(self):
|
|
236
|
-
"""Convert float16 to float"""
|
|
237
|
-
# self.logger.info("Converting float16 to float")
|
|
238
|
-
for col in self.data.columns:
|
|
239
|
-
if is_float_dtype(self.data[col]):
|
|
240
|
-
self.data[col] = self.data[col].astype("float64")
|
|
241
|
-
|
|
242
|
-
def __correct_decimal_comma(self):
|
|
243
|
-
"""Check DataSet for decimal commas and fix them"""
|
|
244
|
-
# self.logger.info("Correct decimal commas")
|
|
245
|
-
columns_to_fix = find_numbers_with_decimal_comma(self.data)
|
|
246
|
-
if len(columns_to_fix) > 0:
|
|
247
|
-
self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
|
|
248
|
-
for col in columns_to_fix:
|
|
249
|
-
self.data[col] = self.data[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
250
|
-
|
|
251
|
-
@staticmethod
|
|
252
|
-
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
253
|
-
try:
|
|
254
|
-
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
255
|
-
return int(ip)
|
|
256
|
-
except Exception:
|
|
257
|
-
pass
|
|
258
|
-
|
|
259
|
-
@staticmethod
|
|
260
|
-
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
261
|
-
try:
|
|
262
|
-
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
263
|
-
return str(int(ip))
|
|
264
|
-
except Exception:
|
|
265
|
-
pass
|
|
266
|
-
|
|
267
|
-
@staticmethod
|
|
268
|
-
def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
|
|
269
|
-
try:
|
|
270
|
-
return ip_address(ip)
|
|
271
|
-
except ValueError:
|
|
272
|
-
pass
|
|
273
|
-
|
|
274
|
-
@staticmethod
|
|
275
|
-
def _is_ipv4(ip: Optional[_BaseAddress]):
|
|
276
|
-
return ip is not None and (
|
|
277
|
-
isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
@staticmethod
|
|
281
|
-
def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
|
|
282
|
-
if isinstance(ip, IPv4Address):
|
|
283
|
-
return ip
|
|
284
|
-
return None
|
|
285
|
-
|
|
286
|
-
@staticmethod
|
|
287
|
-
def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
|
|
288
|
-
if isinstance(ip, IPv6Address):
|
|
289
|
-
return ip
|
|
290
|
-
if isinstance(ip, IPv4Address):
|
|
291
|
-
return IPv6Address("::ffff:" + str(ip))
|
|
292
|
-
return None
|
|
293
|
-
|
|
294
|
-
def __convert_ip(self):
|
|
295
|
-
"""Convert ip address to int"""
|
|
296
|
-
ip = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
297
|
-
if ip is not None and ip in self.data.columns:
|
|
298
|
-
self.logger.info("Convert ip address to int")
|
|
299
|
-
del self.etalon_def[FileColumnMeaningType.IP_ADDRESS.value]
|
|
300
|
-
del self.meaning_types[ip]
|
|
301
|
-
original_ip = self.columns_renaming[ip]
|
|
302
|
-
del self.columns_renaming[ip]
|
|
303
|
-
|
|
304
|
-
search_keys = set()
|
|
305
|
-
for tup in self.search_keys_checked:
|
|
306
|
-
search_keys.update(tup)
|
|
307
|
-
search_keys.remove(ip)
|
|
308
|
-
|
|
309
|
-
self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
|
|
310
|
-
if self.data[ip].isnull().all():
|
|
311
|
-
raise ValidationError(self.bundle.get("invalid_ip").format(ip))
|
|
312
|
-
|
|
313
|
-
ipv4 = ip + "_v4"
|
|
314
|
-
self.data[ipv4] = self.data[ip].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
|
|
315
|
-
self.meaning_types[ipv4] = FileColumnMeaningType.IP_ADDRESS
|
|
316
|
-
self.etalon_def[FileColumnMeaningType.IP_ADDRESS.value] = ipv4
|
|
317
|
-
search_keys.add(ipv4)
|
|
318
|
-
self.columns_renaming[ipv4] = original_ip
|
|
319
|
-
|
|
320
|
-
ipv6 = ip + "_v6"
|
|
321
|
-
self.data[ipv6] = (
|
|
322
|
-
self.data[ip]
|
|
323
|
-
.apply(self._to_ipv6)
|
|
324
|
-
.apply(self._ip_to_int_str)
|
|
325
|
-
.astype("string")
|
|
326
|
-
# .str.replace(".0", "", regex=False)
|
|
327
|
-
)
|
|
328
|
-
self.data = self.data.drop(columns=ip)
|
|
329
|
-
self.meaning_types[ipv6] = FileColumnMeaningType.IPV6_ADDRESS
|
|
330
|
-
self.etalon_def[FileColumnMeaningType.IPV6_ADDRESS.value] = ipv6
|
|
331
|
-
search_keys.add(ipv6)
|
|
332
|
-
self.columns_renaming[ipv6] = original_ip
|
|
333
|
-
self.search_keys = combine_search_keys(search_keys)
|
|
334
|
-
|
|
335
|
-
def __normalize_iso_code(self):
|
|
336
|
-
iso_code = self.etalon_def_checked.get(FileColumnMeaningType.COUNTRY.value)
|
|
337
|
-
if iso_code is not None and iso_code in self.data.columns:
|
|
338
|
-
# self.logger.info("Normalize iso code column")
|
|
339
|
-
self.data[iso_code] = (
|
|
340
|
-
self.data[iso_code]
|
|
341
|
-
.astype("string")
|
|
342
|
-
.str.upper()
|
|
343
|
-
.str.replace(r"[^A-Z]", "", regex=True)
|
|
344
|
-
.str.replace("UK", "GB", regex=False)
|
|
345
|
-
)
|
|
346
|
-
if (self.data[iso_code] == "").all():
|
|
347
|
-
raise ValidationError(self.bundle.get("invalid_country").format(iso_code))
|
|
348
|
-
|
|
349
|
-
def __normalize_postal_code(self):
|
|
350
|
-
postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
|
|
351
|
-
if postal_code is not None and postal_code in self.data.columns:
|
|
352
|
-
# self.logger.info("Normalize postal code")
|
|
353
|
-
|
|
354
|
-
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
355
|
-
try:
|
|
356
|
-
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
357
|
-
except Exception:
|
|
358
|
-
pass
|
|
359
|
-
elif is_float_dtype(self.data[postal_code]):
|
|
360
|
-
self.data[postal_code] = self.data[postal_code].astype("Int64").astype("string")
|
|
361
|
-
|
|
362
|
-
self.data[postal_code] = (
|
|
363
|
-
self.data[postal_code]
|
|
364
|
-
.astype("string")
|
|
365
|
-
.str.upper()
|
|
366
|
-
.str.replace(r"[^0-9A-Z]", "", regex=True) # remove non alphanumeric characters
|
|
367
|
-
.str.replace(r"^0+\B", "", regex=True) # remove leading zeros
|
|
368
|
-
)
|
|
369
|
-
if (self.data[postal_code] == "").all():
|
|
370
|
-
raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))
|
|
371
|
-
|
|
372
|
-
def __normalize_hem(self):
|
|
373
|
-
hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
|
|
374
|
-
if hem is not None and hem in self.data.columns:
|
|
375
|
-
self.data[hem] = self.data[hem].str.lower()
|
|
376
|
-
|
|
377
|
-
def __remove_old_dates(self, silent_mode: bool = False):
|
|
378
|
-
date_column = self.etalon_def_checked.get(FileColumnMeaningType.DATE.value) or self.etalon_def_checked.get(
|
|
379
|
-
FileColumnMeaningType.DATETIME.value
|
|
380
|
-
)
|
|
381
|
-
if date_column is not None and is_numeric_dtype(self.data[date_column]):
|
|
382
|
-
old_subset = self.data[self.data[date_column] < self.MIN_SUPPORTED_DATE_TS]
|
|
383
|
-
if len(old_subset) > 0:
|
|
384
|
-
self.logger.info(f"df before dropping old rows: {self.data.shape}")
|
|
385
|
-
self.data.drop(index=old_subset.index, inplace=True) # type: ignore
|
|
386
|
-
self.logger.info(f"df after dropping old rows: {self.data.shape}")
|
|
387
|
-
if len(self.data) == 0:
|
|
388
|
-
raise ValidationError(self.bundle.get("dataset_all_dates_old"))
|
|
389
|
-
else:
|
|
390
|
-
msg = self.bundle.get("dataset_drop_old_dates")
|
|
391
|
-
self.logger.warning(msg)
|
|
392
|
-
if not silent_mode:
|
|
393
|
-
print(msg)
|
|
394
|
-
self.warning_counter.increment()
|
|
395
|
-
|
|
396
|
-
def __drop_ignore_columns(self):
|
|
397
|
-
"""Drop ignore columns"""
|
|
398
|
-
columns_to_drop = list(set(self.data.columns) & set(self.ignore_columns))
|
|
399
|
-
if len(columns_to_drop) > 0:
|
|
400
|
-
# self.logger.info(f"Dropping ignore columns: {self.ignore_columns}")
|
|
401
|
-
self.data.drop(columns_to_drop, axis=1, inplace=True)
|
|
402
|
-
|
|
403
169
|
def __target_value(self) -> pd.Series:
|
|
404
170
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
|
|
405
171
|
target: pd.Series = self.data[target_column]
|
|
@@ -439,14 +205,14 @@ class Dataset: # (pd.DataFrame):
|
|
|
439
205
|
elif self.task_type == ModelTaskType.REGRESSION:
|
|
440
206
|
if not is_float_dtype(target):
|
|
441
207
|
try:
|
|
442
|
-
self.data[target_column] = self.data[target_column].astype("
|
|
208
|
+
self.data[target_column] = self.data[target_column].astype("float64")
|
|
443
209
|
except ValueError:
|
|
444
210
|
self.logger.exception("Failed to cast target to float for regression task type")
|
|
445
211
|
raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
|
|
446
212
|
elif self.task_type == ModelTaskType.TIMESERIES:
|
|
447
213
|
if not is_float_dtype(target):
|
|
448
214
|
try:
|
|
449
|
-
self.data[target_column] = self.data[target_column].astype("
|
|
215
|
+
self.data[target_column] = self.data[target_column].astype("float64")
|
|
450
216
|
except ValueError:
|
|
451
217
|
self.logger.exception("Failed to cast target to float for timeseries task type")
|
|
452
218
|
raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
|
|
@@ -460,7 +226,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
460
226
|
train_segment = self.data
|
|
461
227
|
|
|
462
228
|
if self.task_type == ModelTaskType.MULTICLASS or (
|
|
463
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.
|
|
229
|
+
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
|
464
230
|
):
|
|
465
231
|
count = len(train_segment)
|
|
466
232
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
@@ -488,6 +254,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
488
254
|
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
|
489
255
|
min_class_threshold = min_class_percent * count
|
|
490
256
|
|
|
257
|
+
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
|
491
258
|
if min_class_count < min_class_threshold:
|
|
492
259
|
self.imbalanced = True
|
|
493
260
|
self.data = balance_undersample(
|
|
@@ -495,7 +262,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
495
262
|
target_column=target_column,
|
|
496
263
|
task_type=self.task_type,
|
|
497
264
|
random_state=self.random_state,
|
|
498
|
-
|
|
265
|
+
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
|
266
|
+
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
|
499
267
|
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
500
268
|
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
501
269
|
logger=self.logger,
|
|
@@ -520,52 +288,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
520
288
|
self.data = resampled_data
|
|
521
289
|
self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
|
|
522
290
|
|
|
523
|
-
def __convert_phone(self):
|
|
524
|
-
"""Convert phone/msisdn to int"""
|
|
525
|
-
# self.logger.info("Convert phone to int")
|
|
526
|
-
msisdn_column = self.etalon_def_checked.get(FileColumnMeaningType.MSISDN.value)
|
|
527
|
-
country_column = self.etalon_def_checked.get(FileColumnMeaningType.COUNTRY.value)
|
|
528
|
-
if msisdn_column is not None and msisdn_column in self.data.columns:
|
|
529
|
-
normalizer = PhoneNormalizer(self.data, msisdn_column, country_column)
|
|
530
|
-
self.data[msisdn_column] = normalizer.normalize()
|
|
531
|
-
if self.data[msisdn_column].isnull().all():
|
|
532
|
-
raise ValidationError(f"All values of PHONE column `{msisdn_column}` are invalid")
|
|
533
|
-
|
|
534
|
-
def __features(self):
|
|
535
|
-
return [
|
|
536
|
-
f for f, meaning_type in self.meaning_types_checked.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
537
|
-
]
|
|
538
|
-
|
|
539
|
-
def __remove_dates_from_features(self, silent_mode: bool = False):
|
|
540
|
-
# self.logger.info("Remove date columns from features")
|
|
541
|
-
|
|
542
|
-
removed_features = []
|
|
543
|
-
for f in self.__features():
|
|
544
|
-
if is_datetime(self.data[f]) or is_period_dtype(self.data[f]):
|
|
545
|
-
removed_features.append(f)
|
|
546
|
-
self.data.drop(columns=f, inplace=True)
|
|
547
|
-
del self.meaning_types_checked[f]
|
|
548
|
-
|
|
549
|
-
if removed_features:
|
|
550
|
-
msg = self.bundle.get("dataset_date_features").format(removed_features)
|
|
551
|
-
self.logger.warning(msg)
|
|
552
|
-
if not silent_mode:
|
|
553
|
-
print(msg)
|
|
554
|
-
self.warning_counter.increment()
|
|
555
|
-
|
|
556
|
-
def __validate_features_count(self):
|
|
557
|
-
if len(self.__features()) > self.MAX_FEATURES_COUNT:
|
|
558
|
-
msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
|
|
559
|
-
self.logger.warning(msg)
|
|
560
|
-
raise ValidationError(msg)
|
|
561
|
-
|
|
562
|
-
def __convert_features_types(self):
|
|
563
|
-
# self.logger.info("Convert features to supported data types")
|
|
564
|
-
|
|
565
|
-
for f in self.__features():
|
|
566
|
-
if not is_numeric_dtype(self.data[f]):
|
|
567
|
-
self.data[f] = self.data[f].astype("string")
|
|
568
|
-
|
|
569
291
|
def __validate_dataset(self, validate_target: bool, silent_mode: bool):
|
|
570
292
|
"""Validate DataSet"""
|
|
571
293
|
# self.logger.info("validating etalon")
|
|
@@ -588,7 +310,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
588
310
|
key
|
|
589
311
|
for search_group in self.search_keys_checked
|
|
590
312
|
for key in search_group
|
|
591
|
-
if self.columns_renaming.get(key)
|
|
313
|
+
if not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
592
314
|
}
|
|
593
315
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
594
316
|
if (
|
|
@@ -696,69 +418,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
696
418
|
if len(self.data) == 0:
|
|
697
419
|
raise ValidationError(self.bundle.get("all_search_keys_invalid"))
|
|
698
420
|
|
|
699
|
-
def __validate_meaning_types(self, validate_target: bool):
|
|
700
|
-
# self.logger.info("Validating meaning types")
|
|
701
|
-
if self.meaning_types is None or len(self.meaning_types) == 0:
|
|
702
|
-
raise ValueError(self.bundle.get("dataset_missing_meaning_types"))
|
|
703
|
-
|
|
704
|
-
if SYSTEM_RECORD_ID not in self.data.columns:
|
|
705
|
-
raise ValueError("Internal error")
|
|
706
|
-
|
|
707
|
-
for column in self.meaning_types:
|
|
708
|
-
if column not in self.data.columns:
|
|
709
|
-
raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
|
|
710
|
-
if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
|
|
711
|
-
raise ValueError(self.bundle.get("dataset_missing_target"))
|
|
712
|
-
|
|
713
|
-
def __validate_search_keys(self):
|
|
714
|
-
# self.logger.info("Validating search keys")
|
|
715
|
-
if self.search_keys is None or len(self.search_keys) == 0:
|
|
716
|
-
raise ValueError(self.bundle.get("dataset_missing_search_keys"))
|
|
717
|
-
for keys_group in self.search_keys:
|
|
718
|
-
for key in keys_group:
|
|
719
|
-
if key not in self.data.columns:
|
|
720
|
-
showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
|
|
721
|
-
raise ValidationError(
|
|
722
|
-
self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
|
|
723
|
-
)
|
|
724
|
-
|
|
725
421
|
def validate(self, validate_target: bool = True, silent_mode: bool = False):
|
|
726
|
-
# self.logger.info("Validating dataset")
|
|
727
|
-
|
|
728
|
-
self.__validate_search_keys()
|
|
729
|
-
|
|
730
|
-
self.__validate_meaning_types(validate_target=validate_target)
|
|
731
|
-
|
|
732
|
-
self.__drop_ignore_columns()
|
|
733
|
-
|
|
734
|
-
self.__rename_columns()
|
|
735
|
-
|
|
736
|
-
self.__remove_dates_from_features(silent_mode)
|
|
737
|
-
|
|
738
|
-
self.__validate_features_count()
|
|
739
|
-
|
|
740
|
-
self.__validate_too_long_string_values()
|
|
741
|
-
|
|
742
|
-
self.__convert_bools()
|
|
743
|
-
|
|
744
|
-
self.__convert_float16()
|
|
745
|
-
|
|
746
|
-
self.__correct_decimal_comma()
|
|
747
|
-
|
|
748
|
-
self.__remove_old_dates(silent_mode)
|
|
749
|
-
|
|
750
|
-
self.__convert_ip()
|
|
751
|
-
|
|
752
|
-
self.__convert_phone()
|
|
753
|
-
|
|
754
|
-
self.__normalize_iso_code()
|
|
755
|
-
|
|
756
|
-
self.__normalize_postal_code()
|
|
757
|
-
|
|
758
|
-
self.__normalize_hem()
|
|
759
|
-
|
|
760
|
-
self.__convert_features_types()
|
|
761
|
-
|
|
762
422
|
self.__validate_dataset(validate_target, silent_mode)
|
|
763
423
|
|
|
764
424
|
if validate_target:
|
|
@@ -776,35 +436,39 @@ class Dataset: # (pd.DataFrame):
|
|
|
776
436
|
# self.logger.info("Constructing dataset metadata")
|
|
777
437
|
columns = []
|
|
778
438
|
for index, (column_name, column_type) in enumerate(zip(self.data.columns, self.data.dtypes)):
|
|
779
|
-
if column_name
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
else:
|
|
797
|
-
min_max_values = None
|
|
798
|
-
column_meta = FileColumnMetadata(
|
|
799
|
-
index=index,
|
|
800
|
-
name=column_name,
|
|
801
|
-
originalName=self.columns_renaming.get(column_name) or column_name,
|
|
802
|
-
dataType=self.__get_data_type(column_type, column_name),
|
|
803
|
-
meaningType=meaning_type,
|
|
804
|
-
minMaxValues=min_max_values,
|
|
439
|
+
if column_name in self.meaning_types_checked:
|
|
440
|
+
meaning_type = self.meaning_types_checked[column_name]
|
|
441
|
+
# Temporary workaround while backend doesn't support datetime
|
|
442
|
+
if meaning_type == FileColumnMeaningType.DATETIME:
|
|
443
|
+
meaning_type = FileColumnMeaningType.DATE
|
|
444
|
+
else:
|
|
445
|
+
meaning_type = FileColumnMeaningType.FEATURE
|
|
446
|
+
if meaning_type in {
|
|
447
|
+
FileColumnMeaningType.DATE,
|
|
448
|
+
FileColumnMeaningType.DATETIME,
|
|
449
|
+
# FileColumnMeaningType.IP_ADDRESS,
|
|
450
|
+
}:
|
|
451
|
+
min_value = self.data[column_name].astype("Int64").min()
|
|
452
|
+
max_value = self.data[column_name].astype("Int64").max()
|
|
453
|
+
min_max_values = NumericInterval(
|
|
454
|
+
minValue=min_value,
|
|
455
|
+
maxValue=max_value,
|
|
805
456
|
)
|
|
457
|
+
else:
|
|
458
|
+
min_max_values = None
|
|
459
|
+
column_meta = FileColumnMetadata(
|
|
460
|
+
index=index,
|
|
461
|
+
name=column_name,
|
|
462
|
+
originalName=self.columns_renaming.get(column_name) or column_name,
|
|
463
|
+
dataType=self.__get_data_type(column_type, column_name),
|
|
464
|
+
meaningType=meaning_type,
|
|
465
|
+
minMaxValues=min_max_values,
|
|
466
|
+
)
|
|
467
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
468
|
+
column_meta.isUnnest = True
|
|
469
|
+
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
806
470
|
|
|
807
|
-
|
|
471
|
+
columns.append(column_meta)
|
|
808
472
|
|
|
809
473
|
return FileMetadata(
|
|
810
474
|
name=self.dataset_name,
|
|
@@ -1036,7 +700,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
1036
700
|
parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
|
|
1037
701
|
self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
|
|
1038
702
|
uploading_file_size = Path(parquet_file_path).stat().st_size
|
|
1039
|
-
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
|
|
703
|
+
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
|
|
1040
704
|
if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
|
|
1041
705
|
raise ValidationError(self.bundle.get("dataset_too_big_file"))
|
|
1042
706
|
return parquet_file_path
|