upgini 1.2.122a4__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.


This version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.122a4"
+__version__ = "1.2.146a4"
upgini/autofe/binary.py CHANGED
@@ -1,5 +1,6 @@
 import abc
 from typing import Optional
+
 import Levenshtein
 import numpy as np
 import pandas as pd
@@ -201,7 +202,7 @@ class JaroWinklerSim1(StringSim):
     has_symmetry_importance: bool = True
 
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None
 
     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -216,7 +217,7 @@ class JaroWinklerSim2(StringSim):
     has_symmetry_importance: bool = True
 
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
+        return value[::-1] if value is not None and len(value) > 0 else None
 
     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -231,7 +232,7 @@ class LevenshteinSim(StringSim):
     has_symmetry_importance: bool = True
 
     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None
 
     def _similarity(self, left: str, right: str) -> float:
         return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
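The three `_prepare_value` changes above share one purpose: empty strings are now normalized to None before similarity is computed. For LevenshteinSim this also guards the division in `_similarity`, which would otherwise divide by max(0, 0) when both sides are empty. A minimal sketch of the resulting behavior (a standalone function, not the package's class API; `Levenshtein` is the same python-Levenshtein package the module imports):

    from typing import Optional

    import Levenshtein

    def levenshtein_sim(left: Optional[str], right: Optional[str]) -> Optional[float]:
        # Mirror _prepare_value: treat None and "" as missing values.
        if not left or not right:
            return None
        # Normalized similarity: 1 minus edit distance over the longer length,
        # so identical strings score 1.0 and disjoint strings approach 0.
        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

    assert levenshtein_sim("kitten", "sitting") == 1 - 3 / 7  # distance 3, max length 7
    assert levenshtein_sim("", "abc") is None                 # empty input short-circuits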
@@ -123,17 +123,9 @@ class DataSourcePublisher:
             set(search_keys.values()) == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
+            or snapshot_frequency_days is not None or join_date_abs_limit_days is not None
         ) and sort_column is None:
             raise ValidationError("Sort column is required for passed search keys")
-        if (
-            set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
-            and snapshot_frequency_days is None
-            and join_date_abs_limit_days is None
-        ):
-            raise ValidationError(
-                "With MSISDN and DATE keys one of the snapshot_frequency_days or"
-                " join_date_abs_limit_days parameters is required"
-            )
         if (
             set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
             or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
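The net effect of this hunk: the separate MSISDN-and-DATE validation is dropped, and passing snapshot_frequency_days or join_date_abs_limit_days now triggers the existing sort-column requirement instead. A hedged restatement of the new condition (variable names are taken from the hunk above; `range_key_sets` and `needs_sort_column` are local names introduced here for readability):

    range_key_sets = [
        {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO},
        {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO},
        {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO},
    ]
    needs_sort_column = (
        set(search_keys.values()) in range_key_sets
        or snapshot_frequency_days is not None
        or join_date_abs_limit_days is not None
    )
    if needs_sort_column and sort_column is None:
        raise ValidationError("Sort column is required for passed search keys")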
upgini/dataset.py CHANGED
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 from pandas.api.types import (
     is_float_dtype,
     is_integer_dtype,
@@ -18,6 +20,7 @@ from pandas.api.types import (
 from upgini.errors import ValidationError
 from upgini.http import ProgressStage, SearchProgress, _RestClient
 from upgini.metadata import (
+    CURRENT_DATE_COL,
     ENTITY_SYSTEM_RECORD_ID,
     EVAL_SET_INDEX,
     SYSTEM_RECORD_ID,
@@ -33,11 +36,13 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
+    SearchKey,
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.search_task import SearchTask
 from upgini.utils.config import SampleConfig
 from upgini.utils.email_utils import EmailSearchKeyConverter
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.sample_utils import SampleColumns, sample
 
 try:
@@ -50,7 +55,7 @@ except Exception:
 
 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS = 200_000
+    MAX_ROWS = 3_000_000
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
     MAX_FEATURES_COUNT = 3500
     MAX_UPLOADING_FILE_SIZE = 268435456  # 256 Mb
@@ -71,6 +76,8 @@ class Dataset:
         date_column: Optional[str] = None,
         id_columns: Optional[List[str]] = None,
         is_imbalanced: bool = False,
+        dropped_columns: Optional[List[str]] = None,
+        autodetected_search_keys: Optional[Dict[str, SearchKey]] = None,
         random_state: Optional[int] = None,
         sample_config: Optional[SampleConfig] = None,
         rest_client: Optional[_RestClient] = None,
@@ -118,6 +125,8 @@ class Dataset:
         self.is_imbalanced: bool = False
         self.id_columns = id_columns
         self.is_imbalanced = is_imbalanced
+        self.dropped_columns = dropped_columns
+        self.autodetected_search_keys = autodetected_search_keys
         self.date_column = date_column
         if logger is not None:
             self.logger = logger
@@ -151,7 +160,9 @@ class Dataset:
     def etalon_def_checked(self) -> Dict[str, str]:
         if self.etalon_def is None:
             self.etalon_def = {
-                v.value: k for k, v in self.meaning_types_checked.items() if v != FileColumnMeaningType.FEATURE
+                v.value: k
+                for k, v in self.meaning_types_checked.items()
+                if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
             }
 
         return self.etalon_def
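The reshaped comprehension keeps the same inversion (meaning-type value → column name) but now also excludes DATE_FEATURE columns. A hypothetical illustration (the column names and the MSISDN/DATE members are example inputs, assuming FileColumnMeaningType is a string-valued enum as in upgini.metadata):

    meaning_types = {
        "phone": FileColumnMeaningType.MSISDN,
        "signup_date": FileColumnMeaningType.DATE,
        "f1": FileColumnMeaningType.FEATURE,
    }
    etalon_def = {
        v.value: k
        for k, v in meaning_types.items()
        if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
    }
    # Only search keys survive the inversion; plain and date features are skipped.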
@@ -283,6 +294,7 @@ class Dataset:
             for key in search_group
             if key in self.columns_renaming
             and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
+            and not self.columns_renaming.get(key) == CURRENT_DATE_COL
         }
         ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
         if (
@@ -292,10 +304,11 @@ class Dataset:
         ):
             keys_to_validate.remove(ipv4_column)
 
-        mandatory_columns = [target]
+        mandatory_columns = {target} if target is not None else {}
         columns_to_validate = mandatory_columns.copy()
-        columns_to_validate.extend(keys_to_validate)
-        columns_to_validate = set([i for i in columns_to_validate if i is not None])
+        columns_to_validate.update(keys_to_validate)
+        if len(columns_to_validate) == 0:
+            return
 
         nrows = len(self.data)
         validation_stats = {}
@@ -358,7 +371,10 @@ class Dataset:
             self.data["valid_keys"] = self.data["valid_keys"] + self.data[f"{col}_is_valid"]
             self.data.drop(columns=f"{col}_is_valid", inplace=True)
 
-        self.data["is_valid"] = self.data["valid_keys"] > 0
+        if len(keys_to_validate) > 0:
+            self.data["is_valid"] = self.data["valid_keys"] > 0
+        else:
+            self.data["is_valid"] = True
         self.data["is_valid"] = self.data["is_valid"] & self.data["valid_mandatory"]
         self.data.drop(columns=["valid_keys", "valid_mandatory"], inplace=True)
 
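The added branch makes the row-validity flag degrade gracefully: when there are no keys left to validate (for example, after the CURRENT_DATE_COL filter above), every row is key-valid and only the mandatory-column check applies. A minimal pandas sketch of the same logic, with toy column values:

    import pandas as pd

    df = pd.DataFrame({"valid_keys": [2, 0, 1], "valid_mandatory": [True, True, False]})
    keys_to_validate: set = set()  # pretend everything was filtered out

    if len(keys_to_validate) > 0:
        df["is_valid"] = df["valid_keys"] > 0
    else:
        df["is_valid"] = True  # nothing to check, so keys are trivially valid
    df["is_valid"] = df["is_valid"] & df["valid_mandatory"]
    # -> [True, True, False]: only the mandatory-column flag can fail a row now.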
@@ -464,6 +480,37 @@ class Dataset:
 
             columns.append(column_meta)
 
+        current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            if (
+                self.date_column is not None
+                and self.data[self.date_column].nunique() == 1
+                and self.data[self.date_column].iloc[0] == current_date
+            ):
+                df_without_fake_date = self.data.drop(columns=[self.date_column])
+            else:
+                df_without_fake_date = self.data
+            parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
+
+            # calculate deterministic digest for any environment
+
+            table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
+            table = table.replace_schema_metadata({})  # remove all metadata
+            pq.write_table(
+                table,
+                parquet_file_path,
+                compression=None,  # any compression will make it non-deterministic
+                data_page_size=0,  # optional, to remove page layout variations
+                use_deprecated_int96_timestamps=False,  # fix timestamp format
+                write_statistics=False,  # remove statistics to make it deterministic
+            )
+
+            deterministic_digest = file_hash(parquet_file_path)
+
+        autodetected_search_keys = (
+            {k: v.name for k, v in self.autodetected_search_keys.items()} if self.autodetected_search_keys else None
+        )
+
         return FileMetadata(
             name=self.dataset_name,
             description=self.description,
@@ -473,6 +520,9 @@ class Dataset:
             hierarchicalGroupKeys=self.hierarchical_group_keys,
             hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
             taskType=self.task_type,
+            droppedColumns=self.dropped_columns,
+            autodetectedSearchKeys=autodetected_search_keys,
+            deterministicDigest=deterministic_digest,
         )
 
     @staticmethod
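The new metadata block above is the most substantial change: the DataFrame is serialized to Parquet with every nondeterministic element stripped (pandas schema metadata, column statistics, compression), so the same data always produces the same bytes and therefore the same digest in any environment. A self-contained sketch of the idea, assuming a SHA-256 file hash; the package's actual algorithm lives in upgini.utils.hash_utils.file_hash and may differ:

    import hashlib
    import tempfile

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    def deterministic_parquet_digest(df: pd.DataFrame) -> str:
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = f"{tmp_dir}/data.parquet"
            table = pa.Table.from_pandas(df, preserve_index=False)
            table = table.replace_schema_metadata({})  # drop pandas/env metadata
            pq.write_table(
                table,
                path,
                compression=None,        # compressor versions change output bytes
                data_page_size=0,        # normalize page layout
                use_deprecated_int96_timestamps=False,
                write_statistics=False,  # stats embed min/max and library details
            )
            h = hashlib.sha256()
            with open(path, "rb") as f:
                for chunk in iter(lambda: f.read(1 << 20), b""):
                    h.update(chunk)
            return h.hexdigest()

    # Equal frames hash equally, regardless of where the file was written.
    a = deterministic_parquet_digest(pd.DataFrame({"x": [1, 2, 3]}))
    b = deterministic_parquet_digest(pd.DataFrame({"x": [1, 2, 3]}))
    assert a == b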