upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. See the registry's advisory page for details.

Files changed (43):
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -10
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +78 -54
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +936 -541
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
  39. upgini-1.2.31.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
  42. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
upgini/dataset.py CHANGED
@@ -1,30 +1,25 @@
1
1
  import csv
2
- import hashlib
3
2
  import logging
4
3
  import tempfile
5
4
  import time
6
- from ipaddress import IPv4Address, IPv6Address, _BaseAddress, ip_address
7
5
  from pathlib import Path
8
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
+ from typing import Any, Callable, Dict, List, Optional, Tuple
9
7
 
10
8
  import numpy as np
11
9
  import pandas as pd
12
- from pandas.api.types import is_bool_dtype as is_bool
13
- from pandas.api.types import is_datetime64_any_dtype as is_datetime
14
10
  from pandas.api.types import (
15
11
  is_float_dtype,
16
12
  is_integer_dtype,
17
13
  is_numeric_dtype,
18
14
  is_object_dtype,
19
- is_period_dtype,
20
15
  is_string_dtype,
21
16
  )
22
17
 
23
18
  from upgini.errors import ValidationError
24
19
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
20
  from upgini.metadata import (
21
+ ENTITY_SYSTEM_RECORD_ID,
26
22
  EVAL_SET_INDEX,
27
- SYSTEM_COLUMNS,
28
23
  SYSTEM_RECORD_ID,
29
24
  TARGET,
30
25
  DataType,
@@ -38,10 +33,8 @@ from upgini.metadata import (
38
33
  RuntimeParameters,
39
34
  SearchCustomization,
40
35
  )
41
- from upgini.normalizer.phone_normalizer import PhoneNormalizer
42
36
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
43
37
  from upgini.search_task import SearchTask
44
- from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
45
38
  from upgini.utils.email_utils import EmailSearchKeyConverter
46
39
  from upgini.utils.target_utils import balance_undersample
47
40
 
@@ -60,7 +53,8 @@ class Dataset: # (pd.DataFrame):
60
53
  FIT_SAMPLE_THRESHOLD = 200_000
61
54
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
62
55
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
63
- MIN_SAMPLE_THRESHOLD = 5_000
56
+ BINARY_MIN_SAMPLE_THRESHOLD = 5_000
57
+ MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
64
58
  IMBALANCE_THESHOLD = 0.6
65
59
  BINARY_BOOTSTRAP_LOOPS = 5
66
60
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -79,6 +73,7 @@ class Dataset: # (pd.DataFrame):
79
73
  path: Optional[str] = None,
80
74
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
75
  search_keys: Optional[List[Tuple[str, ...]]] = None,
76
+ unnest_search_keys: Optional[Dict[str, str]] = None,
82
77
  model_task_type: Optional[ModelTaskType] = None,
83
78
  random_state: Optional[int] = None,
84
79
  rest_client: Optional[_RestClient] = None,
@@ -113,7 +108,7 @@ class Dataset: # (pd.DataFrame):
113
108
  self.description = description
114
109
  self.meaning_types = meaning_types
115
110
  self.search_keys = search_keys
116
- self.ignore_columns = []
111
+ self.unnest_search_keys = unnest_search_keys
117
112
  self.hierarchical_group_keys = []
118
113
  self.hierarchical_subgroup_keys = []
119
114
  self.file_upload_id: Optional[str] = None
@@ -164,242 +159,13 @@ class Dataset: # (pd.DataFrame):
164
159
  raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
165
160
 
166
161
  def __validate_max_row_count(self):
167
- if len(self.data) > self.MAX_ROWS:
162
+ if ENTITY_SYSTEM_RECORD_ID in self.data.columns:
163
+ rows_count = self.data[ENTITY_SYSTEM_RECORD_ID].nunique()
164
+ else:
165
+ rows_count = len(self.data)
166
+ if rows_count > self.MAX_ROWS:
168
167
  raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
169
168
 
170
- def __rename_columns(self):
171
- # self.logger.info("Replace restricted symbols in column names")
172
- new_columns = []
173
- dup_counter = 0
174
- for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
176
- self.columns_renaming[column] = column
177
- new_columns.append(column)
178
- continue
179
-
180
- new_column = str(column)
181
- suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
182
- if len(new_column) == 0:
183
- raise ValidationError(self.bundle.get("dataset_empty_column_names"))
184
- # db limit for column length
185
- if len(new_column) > 250:
186
- new_column = new_column[:250]
187
-
188
- # make column name unique relative to server features
189
- new_column = f"{new_column}_{suffix}"
190
-
191
- new_column = new_column.lower()
192
-
193
- # if column starts with non alphabetic symbol then add "a" to the beginning of string
194
- if ord(new_column[0]) not in range(ord("a"), ord("z") + 1):
195
- new_column = "a" + new_column
196
-
197
- # replace unsupported characters to "_"
198
- for idx, c in enumerate(new_column):
199
- if ord(c) not in range(ord("a"), ord("z") + 1) and ord(c) not in range(ord("0"), ord("9") + 1):
200
- new_column = new_column[:idx] + "_" + new_column[idx + 1 :]
201
-
202
- if new_column in new_columns:
203
- new_column = f"{new_column}_{dup_counter}"
204
- dup_counter += 1
205
- new_columns.append(new_column)
206
-
207
- # self.data.columns.values[col_idx] = new_column
208
- # self.rename(columns={column: new_column}, inplace=True)
209
- self.meaning_types = {
210
- (new_column if key == str(column) else key): value for key, value in self.meaning_types_checked.items()
211
- }
212
- self.search_keys = [
213
- tuple(new_column if key == str(column) else key for key in keys) for keys in self.search_keys_checked
214
- ]
215
- self.columns_renaming[new_column] = str(column)
216
- self.data.columns = new_columns
217
- self.etalon_def = None
218
-
219
- def __validate_too_long_string_values(self):
220
- """Check that string values less than maximum characters for LLM"""
221
- # self.logger.info("Validate too long string values")
222
- for col in self.data.columns:
223
- if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
224
- max_length: int = self.data[col].astype("str").str.len().max()
225
- if max_length > self.MAX_STRING_FEATURE_LENGTH:
226
- self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
227
-
228
- def __convert_bools(self):
229
- """Convert bool columns to string"""
230
- # self.logger.info("Converting bool to int")
231
- for col in self.data.columns:
232
- if is_bool(self.data[col]):
233
- self.data[col] = self.data[col].astype("str")
234
-
235
- def __convert_float16(self):
236
- """Convert float16 to float"""
237
- # self.logger.info("Converting float16 to float")
238
- for col in self.data.columns:
239
- if is_float_dtype(self.data[col]):
240
- self.data[col] = self.data[col].astype("float64")
241
-
242
- def __correct_decimal_comma(self):
243
- """Check DataSet for decimal commas and fix them"""
244
- # self.logger.info("Correct decimal commas")
245
- columns_to_fix = find_numbers_with_decimal_comma(self.data)
246
- if len(columns_to_fix) > 0:
247
- self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
248
- for col in columns_to_fix:
249
- self.data[col] = self.data[col].astype("string").str.replace(",", ".").astype(np.float64)
250
-
251
- @staticmethod
252
- def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
253
- try:
254
- if isinstance(ip, (IPv4Address, IPv6Address)):
255
- return int(ip)
256
- except Exception:
257
- pass
258
-
259
- @staticmethod
260
- def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
261
- try:
262
- if isinstance(ip, (IPv4Address, IPv6Address)):
263
- return str(int(ip))
264
- except Exception:
265
- pass
266
-
267
- @staticmethod
268
- def _safe_ip_parse(ip: Union[str, int, IPv4Address, IPv6Address]) -> Optional[_BaseAddress]:
269
- try:
270
- return ip_address(ip)
271
- except ValueError:
272
- pass
273
-
274
- @staticmethod
275
- def _is_ipv4(ip: Optional[_BaseAddress]):
276
- return ip is not None and (
277
- isinstance(ip, IPv4Address) or (isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None)
278
- )
279
-
280
- @staticmethod
281
- def _to_ipv4(ip: Optional[_BaseAddress]) -> Optional[IPv4Address]:
282
- if isinstance(ip, IPv4Address):
283
- return ip
284
- return None
285
-
286
- @staticmethod
287
- def _to_ipv6(ip: Optional[_BaseAddress]) -> Optional[IPv6Address]:
288
- if isinstance(ip, IPv6Address):
289
- return ip
290
- if isinstance(ip, IPv4Address):
291
- return IPv6Address("::ffff:" + str(ip))
292
- return None
293
-
294
- def __convert_ip(self):
295
- """Convert ip address to int"""
296
- ip = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
297
- if ip is not None and ip in self.data.columns:
298
- self.logger.info("Convert ip address to int")
299
- del self.etalon_def[FileColumnMeaningType.IP_ADDRESS.value]
300
- del self.meaning_types[ip]
301
- original_ip = self.columns_renaming[ip]
302
- del self.columns_renaming[ip]
303
-
304
- search_keys = set()
305
- for tup in self.search_keys_checked:
306
- search_keys.update(tup)
307
- search_keys.remove(ip)
308
-
309
- self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
310
- if self.data[ip].isnull().all():
311
- raise ValidationError(self.bundle.get("invalid_ip").format(ip))
312
-
313
- ipv4 = ip + "_v4"
314
- self.data[ipv4] = self.data[ip].apply(self._to_ipv4).apply(self._ip_to_int).astype("Int64")
315
- self.meaning_types[ipv4] = FileColumnMeaningType.IP_ADDRESS
316
- self.etalon_def[FileColumnMeaningType.IP_ADDRESS.value] = ipv4
317
- search_keys.add(ipv4)
318
- self.columns_renaming[ipv4] = original_ip
319
-
320
- ipv6 = ip + "_v6"
321
- self.data[ipv6] = (
322
- self.data[ip]
323
- .apply(self._to_ipv6)
324
- .apply(self._ip_to_int_str)
325
- .astype("string")
326
- # .str.replace(".0", "", regex=False)
327
- )
328
- self.data = self.data.drop(columns=ip)
329
- self.meaning_types[ipv6] = FileColumnMeaningType.IPV6_ADDRESS
330
- self.etalon_def[FileColumnMeaningType.IPV6_ADDRESS.value] = ipv6
331
- search_keys.add(ipv6)
332
- self.columns_renaming[ipv6] = original_ip
333
- self.search_keys = combine_search_keys(search_keys)
334
-
335
- def __normalize_iso_code(self):
336
- iso_code = self.etalon_def_checked.get(FileColumnMeaningType.COUNTRY.value)
337
- if iso_code is not None and iso_code in self.data.columns:
338
- # self.logger.info("Normalize iso code column")
339
- self.data[iso_code] = (
340
- self.data[iso_code]
341
- .astype("string")
342
- .str.upper()
343
- .str.replace(r"[^A-Z]", "", regex=True)
344
- .str.replace("UK", "GB", regex=False)
345
- )
346
- if (self.data[iso_code] == "").all():
347
- raise ValidationError(self.bundle.get("invalid_country").format(iso_code))
348
-
349
- def __normalize_postal_code(self):
350
- postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
351
- if postal_code is not None and postal_code in self.data.columns:
352
- # self.logger.info("Normalize postal code")
353
-
354
- if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
355
- try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
357
- except Exception:
358
- pass
359
- elif is_float_dtype(self.data[postal_code]):
360
- self.data[postal_code] = self.data[postal_code].astype("Int64").astype("string")
361
-
362
- self.data[postal_code] = (
363
- self.data[postal_code]
364
- .astype("string")
365
- .str.upper()
366
- .str.replace(r"[^0-9A-Z]", "", regex=True) # remove non alphanumeric characters
367
- .str.replace(r"^0+\B", "", regex=True) # remove leading zeros
368
- )
369
- if (self.data[postal_code] == "").all():
370
- raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))
371
-
372
- def __normalize_hem(self):
373
- hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
374
- if hem is not None and hem in self.data.columns:
375
- self.data[hem] = self.data[hem].str.lower()
376
-
377
- def __remove_old_dates(self, silent_mode: bool = False):
378
- date_column = self.etalon_def_checked.get(FileColumnMeaningType.DATE.value) or self.etalon_def_checked.get(
379
- FileColumnMeaningType.DATETIME.value
380
- )
381
- if date_column is not None and is_numeric_dtype(self.data[date_column]):
382
- old_subset = self.data[self.data[date_column] < self.MIN_SUPPORTED_DATE_TS]
383
- if len(old_subset) > 0:
384
- self.logger.info(f"df before dropping old rows: {self.data.shape}")
385
- self.data.drop(index=old_subset.index, inplace=True) # type: ignore
386
- self.logger.info(f"df after dropping old rows: {self.data.shape}")
387
- if len(self.data) == 0:
388
- raise ValidationError(self.bundle.get("dataset_all_dates_old"))
389
- else:
390
- msg = self.bundle.get("dataset_drop_old_dates")
391
- self.logger.warning(msg)
392
- if not silent_mode:
393
- print(msg)
394
- self.warning_counter.increment()
395
-
396
- def __drop_ignore_columns(self):
397
- """Drop ignore columns"""
398
- columns_to_drop = list(set(self.data.columns) & set(self.ignore_columns))
399
- if len(columns_to_drop) > 0:
400
- # self.logger.info(f"Dropping ignore columns: {self.ignore_columns}")
401
- self.data.drop(columns_to_drop, axis=1, inplace=True)
402
-
403
169
  def __target_value(self) -> pd.Series:
404
170
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
405
171
  target: pd.Series = self.data[target_column]
@@ -439,14 +205,14 @@ class Dataset: # (pd.DataFrame):
439
205
  elif self.task_type == ModelTaskType.REGRESSION:
440
206
  if not is_float_dtype(target):
441
207
  try:
442
- self.data[target_column] = self.data[target_column].astype("float")
208
+ self.data[target_column] = self.data[target_column].astype("float64")
443
209
  except ValueError:
444
210
  self.logger.exception("Failed to cast target to float for regression task type")
445
211
  raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
446
212
  elif self.task_type == ModelTaskType.TIMESERIES:
447
213
  if not is_float_dtype(target):
448
214
  try:
449
- self.data[target_column] = self.data[target_column].astype("float")
215
+ self.data[target_column] = self.data[target_column].astype("float64")
450
216
  except ValueError:
451
217
  self.logger.exception("Failed to cast target to float for timeseries task type")
452
218
  raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
@@ -460,7 +226,7 @@ class Dataset: # (pd.DataFrame):
460
226
  train_segment = self.data
461
227
 
462
228
  if self.task_type == ModelTaskType.MULTICLASS or (
463
- self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
229
+ self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
464
230
  ):
465
231
  count = len(train_segment)
466
232
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -488,6 +254,7 @@ class Dataset: # (pd.DataFrame):
488
254
  min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
489
255
  min_class_threshold = min_class_percent * count
490
256
 
257
+ # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
491
258
  if min_class_count < min_class_threshold:
492
259
  self.imbalanced = True
493
260
  self.data = balance_undersample(
@@ -495,7 +262,8 @@ class Dataset: # (pd.DataFrame):
495
262
  target_column=target_column,
496
263
  task_type=self.task_type,
497
264
  random_state=self.random_state,
498
- imbalance_threshold=self.IMBALANCE_THESHOLD,
265
+ binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
266
+ multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
499
267
  binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
500
268
  multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
501
269
  logger=self.logger,
@@ -520,52 +288,6 @@ class Dataset: # (pd.DataFrame):
520
288
  self.data = resampled_data
521
289
  self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
522
290
 
523
- def __convert_phone(self):
524
- """Convert phone/msisdn to int"""
525
- # self.logger.info("Convert phone to int")
526
- msisdn_column = self.etalon_def_checked.get(FileColumnMeaningType.MSISDN.value)
527
- country_column = self.etalon_def_checked.get(FileColumnMeaningType.COUNTRY.value)
528
- if msisdn_column is not None and msisdn_column in self.data.columns:
529
- normalizer = PhoneNormalizer(self.data, msisdn_column, country_column)
530
- self.data[msisdn_column] = normalizer.normalize()
531
- if self.data[msisdn_column].isnull().all():
532
- raise ValidationError(f"All values of PHONE column `{msisdn_column}` are invalid")
533
-
534
- def __features(self):
535
- return [
536
- f for f, meaning_type in self.meaning_types_checked.items() if meaning_type == FileColumnMeaningType.FEATURE
537
- ]
538
-
539
- def __remove_dates_from_features(self, silent_mode: bool = False):
540
- # self.logger.info("Remove date columns from features")
541
-
542
- removed_features = []
543
- for f in self.__features():
544
- if is_datetime(self.data[f]) or is_period_dtype(self.data[f]):
545
- removed_features.append(f)
546
- self.data.drop(columns=f, inplace=True)
547
- del self.meaning_types_checked[f]
548
-
549
- if removed_features:
550
- msg = self.bundle.get("dataset_date_features").format(removed_features)
551
- self.logger.warning(msg)
552
- if not silent_mode:
553
- print(msg)
554
- self.warning_counter.increment()
555
-
556
- def __validate_features_count(self):
557
- if len(self.__features()) > self.MAX_FEATURES_COUNT:
558
- msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
559
- self.logger.warning(msg)
560
- raise ValidationError(msg)
561
-
562
- def __convert_features_types(self):
563
- # self.logger.info("Convert features to supported data types")
564
-
565
- for f in self.__features():
566
- if not is_numeric_dtype(self.data[f]):
567
- self.data[f] = self.data[f].astype("string")
568
-
569
291
  def __validate_dataset(self, validate_target: bool, silent_mode: bool):
570
292
  """Validate DataSet"""
571
293
  # self.logger.info("validating etalon")
@@ -588,7 +310,7 @@ class Dataset: # (pd.DataFrame):
588
310
  key
589
311
  for search_group in self.search_keys_checked
590
312
  for key in search_group
591
- if self.columns_renaming.get(key) != EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME
313
+ if not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
592
314
  }
593
315
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
594
316
  if (
@@ -696,69 +418,7 @@ class Dataset: # (pd.DataFrame):
696
418
  if len(self.data) == 0:
697
419
  raise ValidationError(self.bundle.get("all_search_keys_invalid"))
698
420
 
699
- def __validate_meaning_types(self, validate_target: bool):
700
- # self.logger.info("Validating meaning types")
701
- if self.meaning_types is None or len(self.meaning_types) == 0:
702
- raise ValueError(self.bundle.get("dataset_missing_meaning_types"))
703
-
704
- if SYSTEM_RECORD_ID not in self.data.columns:
705
- raise ValueError("Internal error")
706
-
707
- for column in self.meaning_types:
708
- if column not in self.data.columns:
709
- raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
710
- if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
711
- raise ValueError(self.bundle.get("dataset_missing_target"))
712
-
713
- def __validate_search_keys(self):
714
- # self.logger.info("Validating search keys")
715
- if self.search_keys is None or len(self.search_keys) == 0:
716
- raise ValueError(self.bundle.get("dataset_missing_search_keys"))
717
- for keys_group in self.search_keys:
718
- for key in keys_group:
719
- if key not in self.data.columns:
720
- showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
721
- raise ValidationError(
722
- self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
723
- )
724
-
725
421
  def validate(self, validate_target: bool = True, silent_mode: bool = False):
726
- # self.logger.info("Validating dataset")
727
-
728
- self.__validate_search_keys()
729
-
730
- self.__validate_meaning_types(validate_target=validate_target)
731
-
732
- self.__drop_ignore_columns()
733
-
734
- self.__rename_columns()
735
-
736
- self.__remove_dates_from_features(silent_mode)
737
-
738
- self.__validate_features_count()
739
-
740
- self.__validate_too_long_string_values()
741
-
742
- self.__convert_bools()
743
-
744
- self.__convert_float16()
745
-
746
- self.__correct_decimal_comma()
747
-
748
- self.__remove_old_dates(silent_mode)
749
-
750
- self.__convert_ip()
751
-
752
- self.__convert_phone()
753
-
754
- self.__normalize_iso_code()
755
-
756
- self.__normalize_postal_code()
757
-
758
- self.__normalize_hem()
759
-
760
- self.__convert_features_types()
761
-
762
422
  self.__validate_dataset(validate_target, silent_mode)
763
423
 
764
424
  if validate_target:
@@ -776,35 +436,39 @@ class Dataset: # (pd.DataFrame):
776
436
  # self.logger.info("Constructing dataset metadata")
777
437
  columns = []
778
438
  for index, (column_name, column_type) in enumerate(zip(self.data.columns, self.data.dtypes)):
779
- if column_name not in self.ignore_columns:
780
- if column_name in self.meaning_types_checked:
781
- meaning_type = self.meaning_types_checked[column_name]
782
- # Temporary workaround while backend doesn't support datetime
783
- if meaning_type == FileColumnMeaningType.DATETIME:
784
- meaning_type = FileColumnMeaningType.DATE
785
- else:
786
- meaning_type = FileColumnMeaningType.FEATURE
787
- if meaning_type in {
788
- FileColumnMeaningType.DATE,
789
- FileColumnMeaningType.DATETIME,
790
- # FileColumnMeaningType.IP_ADDRESS,
791
- }:
792
- min_max_values = NumericInterval(
793
- minValue=self.data[column_name].astype("Int64").min(),
794
- maxValue=self.data[column_name].astype("Int64").max(),
795
- )
796
- else:
797
- min_max_values = None
798
- column_meta = FileColumnMetadata(
799
- index=index,
800
- name=column_name,
801
- originalName=self.columns_renaming.get(column_name) or column_name,
802
- dataType=self.__get_data_type(column_type, column_name),
803
- meaningType=meaning_type,
804
- minMaxValues=min_max_values,
439
+ if column_name in self.meaning_types_checked:
440
+ meaning_type = self.meaning_types_checked[column_name]
441
+ # Temporary workaround while backend doesn't support datetime
442
+ if meaning_type == FileColumnMeaningType.DATETIME:
443
+ meaning_type = FileColumnMeaningType.DATE
444
+ else:
445
+ meaning_type = FileColumnMeaningType.FEATURE
446
+ if meaning_type in {
447
+ FileColumnMeaningType.DATE,
448
+ FileColumnMeaningType.DATETIME,
449
+ # FileColumnMeaningType.IP_ADDRESS,
450
+ }:
451
+ min_value = self.data[column_name].astype("Int64").min()
452
+ max_value = self.data[column_name].astype("Int64").max()
453
+ min_max_values = NumericInterval(
454
+ minValue=min_value,
455
+ maxValue=max_value,
805
456
  )
457
+ else:
458
+ min_max_values = None
459
+ column_meta = FileColumnMetadata(
460
+ index=index,
461
+ name=column_name,
462
+ originalName=self.columns_renaming.get(column_name) or column_name,
463
+ dataType=self.__get_data_type(column_type, column_name),
464
+ meaningType=meaning_type,
465
+ minMaxValues=min_max_values,
466
+ )
467
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
468
+ column_meta.isUnnest = True
469
+ column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
806
470
 
807
- columns.append(column_meta)
471
+ columns.append(column_meta)
808
472
 
809
473
  return FileMetadata(
810
474
  name=self.dataset_name,
@@ -1036,7 +700,7 @@ class Dataset: # (pd.DataFrame):
1036
700
  parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
1037
701
  self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
1038
702
  uploading_file_size = Path(parquet_file_path).stat().st_size
1039
- self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
703
+ self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
1040
704
  if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
1041
705
  raise ValidationError(self.bundle.get("dataset_too_big_file"))
1042
706
  return parquet_file_path