upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of upgini might be problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -3
- upgini/data_source/data_source_publisher.py +1 -9
- upgini/dataset.py +56 -6
- upgini/features_enricher.py +634 -556
- upgini/http.py +2 -2
- upgini/metadata.py +16 -2
- upgini/normalizer/normalize_utils.py +6 -6
- upgini/resource_bundle/strings.properties +15 -11
- upgini/search_task.py +14 -2
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +125 -39
- upgini/utils/deduplicate_utils.py +8 -5
- upgini/utils/display_utils.py +61 -20
- upgini/utils/feature_info.py +18 -7
- upgini/utils/features_validator.py +6 -4
- upgini/utils/postal_code_utils.py +35 -2
- upgini/utils/target_utils.py +3 -1
- upgini/utils/track_info.py +29 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/METADATA +123 -121
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/RECORD +23 -23
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/WHEEL +1 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.124"
+__version__ = "1.2.146a4"
upgini/autofe/binary.py
CHANGED
@@ -1,5 +1,6 @@
 import abc
 from typing import Optional
+
 import Levenshtein
 import numpy as np
 import pandas as pd
@@ -201,7 +202,7 @@ class JaroWinklerSim1(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -216,7 +217,7 @@ class JaroWinklerSim2(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
+        return value[::-1] if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -231,7 +232,7 @@ class LevenshteinSim(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
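All three `_prepare_value` overrides now coerce empty strings to `None`, so empty values are treated as missing before `_similarity` runs. This matters most for `LevenshteinSim`, whose normalization divides by `max(len(left), len(right))`. A minimal sketch of the failure mode (assuming, per the class structure above, that a `None` from `_prepare_value` short-circuits the similarity computation):

```python
import Levenshtein

def levenshtein_sim(left: str, right: str) -> float:
    # Same normalization as LevenshteinSim._similarity above
    return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

def prepare(value):
    # New guard: empty strings are treated like missing values
    return value if value is not None and len(value) > 0 else None

print(levenshtein_sim("kitten", "sitting"))  # ~0.571
print(prepare(""))                           # None -> pair is skipped entirely
# levenshtein_sim("", "") would raise ZeroDivisionError: max(0, 0) == 0
```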
upgini/data_source/data_source_publisher.py
CHANGED
@@ -123,17 +123,9 @@ class DataSourcePublisher:
             set(search_keys.values()) == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
             or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
+            or snapshot_frequency_days is not None or join_date_abs_limit_days is not None
         ) and sort_column is None:
             raise ValidationError("Sort column is required for passed search keys")
-        if (
-            set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
-            and snapshot_frequency_days is None
-            and join_date_abs_limit_days is None
-        ):
-            raise ValidationError(
-                "With MSISDN and DATE keys one of the snapshot_frequency_days or"
-                " join_date_abs_limit_days parameters is required"
-            )
         if (
             set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
             or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
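The dedicated MSISDN+DATE guard is dropped; passing snapshot_frequency_days or join_date_abs_limit_days now simply triggers the generic sort-column requirement. A standalone sketch of the consolidated predicate (the helper name is mine, not upgini's):

```python
from typing import Dict, Optional

from upgini.metadata import SearchKey

def sort_column_required(
    search_keys: Dict[str, SearchKey],
    snapshot_frequency_days: Optional[int] = None,
    join_date_abs_limit_days: Optional[int] = None,
) -> bool:
    # Range-style keys and any snapshot/join-limit settings all demand a sort column
    values = set(search_keys.values())
    return (
        values == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
        or values == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
        or values == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
        or snapshot_frequency_days is not None
        or join_date_abs_limit_days is not None
    )

# Under the new rule, a plain PHONE+DATE dataset with snapshot_frequency_days set
# needs a sort column too:
assert sort_column_required({"phone": SearchKey.PHONE, "dt": SearchKey.DATE},
                            snapshot_frequency_days=30)
```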
upgini/dataset.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple

 import numpy as np
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 from pandas.api.types import (
     is_float_dtype,
     is_integer_dtype,
@@ -18,6 +20,7 @@ from pandas.api.types import (
 from upgini.errors import ValidationError
 from upgini.http import ProgressStage, SearchProgress, _RestClient
 from upgini.metadata import (
+    CURRENT_DATE_COL,
     ENTITY_SYSTEM_RECORD_ID,
     EVAL_SET_INDEX,
     SYSTEM_RECORD_ID,
@@ -33,11 +36,13 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
+    SearchKey,
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.search_task import SearchTask
 from upgini.utils.config import SampleConfig
 from upgini.utils.email_utils import EmailSearchKeyConverter
+from upgini.utils.hash_utils import file_hash
 from upgini.utils.sample_utils import SampleColumns, sample

 try:
@@ -50,7 +55,7 @@ except Exception:

 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS =
+    MAX_ROWS = 3_000_000
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
     MAX_FEATURES_COUNT = 3500
     MAX_UPLOADING_FILE_SIZE = 268435456  # 256 Mb
@@ -71,6 +76,8 @@ class Dataset:
         date_column: Optional[str] = None,
         id_columns: Optional[List[str]] = None,
         is_imbalanced: bool = False,
+        dropped_columns: Optional[List[str]] = None,
+        autodetected_search_keys: Optional[Dict[str, SearchKey]] = None,
         random_state: Optional[int] = None,
         sample_config: Optional[SampleConfig] = None,
         rest_client: Optional[_RestClient] = None,
@@ -118,6 +125,8 @@ class Dataset:
         self.is_imbalanced: bool = False
         self.id_columns = id_columns
         self.is_imbalanced = is_imbalanced
+        self.dropped_columns = dropped_columns
+        self.autodetected_search_keys = autodetected_search_keys
         self.date_column = date_column
         if logger is not None:
             self.logger = logger
@@ -151,7 +160,9 @@ class Dataset:
     def etalon_def_checked(self) -> Dict[str, str]:
         if self.etalon_def is None:
             self.etalon_def = {
-                v.value: k
+                v.value: k
+                for k, v in self.meaning_types_checked.items()
+                if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
             }

         return self.etalon_def
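With the filter in place, `etalon_def` maps each non-feature meaning type's value to its column name, keeping plain features out of the etalon definition. A toy illustration (the column names and meaning-type assignments are invented):

```python
from upgini.metadata import FileColumnMeaningType

# Hypothetical column -> meaning-type assignment
meaning_types = {
    "ip": FileColumnMeaningType.IP_ADDRESS,
    "salary": FileColumnMeaningType.FEATURE,
}

etalon_def = {
    v.value: k
    for k, v in meaning_types.items()
    if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
}
# -> {FileColumnMeaningType.IP_ADDRESS.value: "ip"}; "salary" (a FEATURE) is excluded
print(etalon_def)
```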
@@ -283,6 +294,7 @@ class Dataset:
             for key in search_group
             if key in self.columns_renaming
             and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
+            and not self.columns_renaming.get(key) == CURRENT_DATE_COL
         }
         ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
         if (
@@ -292,10 +304,11 @@ class Dataset:
         ):
             keys_to_validate.remove(ipv4_column)

-        mandatory_columns =
+        mandatory_columns = {target} if target is not None else {}
         columns_to_validate = mandatory_columns.copy()
-        columns_to_validate.
-
+        columns_to_validate.update(keys_to_validate)
+        if len(columns_to_validate) == 0:
+            return

         nrows = len(self.data)
         validation_stats = {}
@@ -358,7 +371,10 @@ class Dataset:
             self.data["valid_keys"] = self.data["valid_keys"] + self.data[f"{col}_is_valid"]
             self.data.drop(columns=f"{col}_is_valid", inplace=True)

-
+        if len(keys_to_validate) > 0:
+            self.data["is_valid"] = self.data["valid_keys"] > 0
+        else:
+            self.data["is_valid"] = True
         self.data["is_valid"] = self.data["is_valid"] & self.data["valid_mandatory"]
         self.data.drop(columns=["valid_keys", "valid_mandatory"], inplace=True)

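Row validity is now computed in two stages: a row must have at least one valid search key (trivially true when there are no keys to validate) and must pass the mandatory-column check. A toy pandas sketch of the same logic (the column values are invented):

```python
import pandas as pd

df = pd.DataFrame({
    "valid_keys": [2, 0, 1],                 # how many search keys validated per row
    "valid_mandatory": [True, True, False],  # did all mandatory columns validate
})
keys_to_validate = {"phone_num"}             # hypothetical key set

if len(keys_to_validate) > 0:
    df["is_valid"] = df["valid_keys"] > 0
else:
    df["is_valid"] = True                    # no keys to check -> key stage passes trivially
df["is_valid"] = df["is_valid"] & df["valid_mandatory"]

print(df["is_valid"].tolist())               # [True, False, False]
```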
@@ -464,6 +480,37 @@ class Dataset:

             columns.append(column_meta)

+        current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            if (
+                self.date_column is not None
+                and self.data[self.date_column].nunique() == 1
+                and self.data[self.date_column].iloc[0] == current_date
+            ):
+                df_without_fake_date = self.data.drop(columns=[self.date_column])
+            else:
+                df_without_fake_date = self.data
+            parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
+
+            # calculate deterministic digest for any environment
+
+            table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
+            table = table.replace_schema_metadata({})  # remove all metadata
+            pq.write_table(
+                table,
+                parquet_file_path,
+                compression=None,  # any compression will make it non-deterministic
+                data_page_size=0,  # optional, to remove page layout variations
+                use_deprecated_int96_timestamps=False,  # fix timestamp format
+                write_statistics=False,  # remove statistics to make it deterministic
+            )
+
+            deterministic_digest = file_hash(parquet_file_path)
+
+        autodetected_search_keys = (
+            {k: v.name for k, v in self.autodetected_search_keys.items()} if self.autodetected_search_keys else None
+        )
+
         return FileMetadata(
             name=self.dataset_name,
             description=self.description,
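For the digest to match across environments, the parquet file must be byte-identical on every write: no compression, no column statistics, no schema metadata, and a fixed timestamp encoding. A minimal standalone sketch of the same idea, hashing with hashlib.sha256 since the exact algorithm behind upgini's file_hash helper isn't shown in this diff:

```python
import hashlib
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def deterministic_digest(df: pd.DataFrame) -> str:
    # Serialize to a canonical, metadata-free parquet file and hash the bytes
    table = pa.Table.from_pandas(df, preserve_index=False)
    table = table.replace_schema_metadata({})  # drop pandas/pyarrow metadata
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = f"{tmp_dir}/digest.parquet"
        pq.write_table(
            table,
            path,
            compression=None,        # compressed bytes can vary across library builds
            data_page_size=0,
            use_deprecated_int96_timestamps=False,
            write_statistics=False,  # statistics embed environment-dependent bytes
        )
        with open(path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
assert deterministic_digest(df) == deterministic_digest(df.copy())
```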
@@ -473,6 +520,9 @@ class Dataset:
             hierarchicalGroupKeys=self.hierarchical_group_keys,
             hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
             taskType=self.task_type,
+            droppedColumns=self.dropped_columns,
+            autodetectedSearchKeys=autodetected_search_keys,
+            deterministicDigest=deterministic_digest,
         )

     @staticmethod