upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -9
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +83 -41
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +931 -542
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
- upgini-1.2.31.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280.dev0.dist-info/RECORD +0 -62
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,7 +3,7 @@ import time
|
|
|
3
3
|
import uuid
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from enum import Enum
|
|
6
|
-
from typing import Dict, List, Optional, Union
|
|
6
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
7
7
|
|
|
8
8
|
from upgini.errors import HttpError, ValidationError
|
|
9
9
|
from upgini.http import LoggerFactory, get_rest_client
|
|
@@ -47,7 +47,9 @@ class DataSourcePublisher:
|
|
|
47
47
|
self,
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
|
-
update_frequency: str,
|
|
50
|
+
update_frequency: Union[
|
|
51
|
+
Literal["Daily"], Literal["Weekly"], Literal["Monthly"], Literal["Quarterly"], Literal["Annually"]
|
|
52
|
+
],
|
|
51
53
|
exclude_from_autofe_generation: Optional[List[str]],
|
|
52
54
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
53
55
|
sort_column: Optional[str] = None,
|
|
@@ -58,9 +60,43 @@ class DataSourcePublisher:
|
|
|
58
60
|
join_date_abs_limit_days: Optional[int] = None,
|
|
59
61
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
60
62
|
data_table_id_to_replace: Optional[str] = None,
|
|
63
|
+
keep_features: Optional[List[str]] = None,
|
|
64
|
+
date_features: Optional[List[str]] = None,
|
|
65
|
+
date_vector_features: Optional[List[str]] = None,
|
|
66
|
+
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
67
|
+
exclude_raw: Optional[List[str]] = None,
|
|
61
68
|
_force_generation=False,
|
|
62
69
|
_silent=False,
|
|
63
70
|
) -> str:
|
|
71
|
+
"""Register new ADS
|
|
72
|
+
|
|
73
|
+
Parameters
|
|
74
|
+
----------
|
|
75
|
+
data_table_uri - str - table name in format {project_id}.{datasource_name}.{table_name}
|
|
76
|
+
|
|
77
|
+
search_keys - dict with column names as keys and SearchKey as value
|
|
78
|
+
|
|
79
|
+
update_frequency - str - (Monthly, Weekly, Daily, Annually, Quarterly)
|
|
80
|
+
|
|
81
|
+
exclude_from_autofe_generation - optional list of features that should be excluded from AutoFE
|
|
82
|
+
|
|
83
|
+
secondary_search_keys - optional dict of secondary search keys
|
|
84
|
+
|
|
85
|
+
sort_column - optional str - name of unique column that could be used for sort
|
|
86
|
+
|
|
87
|
+
date_format - optional str - format of date if it is present in search keys
|
|
88
|
+
|
|
89
|
+
features_for_embeddings - optional list of str - list of features that should be used for GPT features
|
|
90
|
+
generation
|
|
91
|
+
|
|
92
|
+
exclude_raw - optional list of str - list of features that should NOT be used as raw features
|
|
93
|
+
|
|
94
|
+
...
|
|
95
|
+
|
|
96
|
+
data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
|
|
97
|
+
|
|
98
|
+
keep_features - optional list - features that should not be removed from ADS (even if they are personal)
|
|
99
|
+
"""
|
|
64
100
|
trace_id = str(uuid.uuid4())
|
|
65
101
|
|
|
66
102
|
with MDC(trace_id=trace_id):
|
|
@@ -94,6 +130,11 @@ class DataSourcePublisher:
|
|
|
94
130
|
"With MSISDN and DATE keys one of the snapshot_frequency_days or"
|
|
95
131
|
" join_date_abs_limit_days parameters is required"
|
|
96
132
|
)
|
|
133
|
+
if (
|
|
134
|
+
set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
|
|
135
|
+
or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
|
|
136
|
+
) and not date_format:
|
|
137
|
+
raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
|
|
97
138
|
|
|
98
139
|
request = {
|
|
99
140
|
"dataTableUri": data_table_uri,
|
|
@@ -116,6 +157,20 @@ class DataSourcePublisher:
|
|
|
116
157
|
request["adsDefinitionIdToReplace"] = data_table_id_to_replace
|
|
117
158
|
if exclude_from_autofe_generation is not None:
|
|
118
159
|
request["excludeFromGeneration"] = exclude_from_autofe_generation
|
|
160
|
+
if keep_features is not None:
|
|
161
|
+
request["keepFeatures"] = keep_features
|
|
162
|
+
if date_features is not None:
|
|
163
|
+
if date_format is None:
|
|
164
|
+
raise ValidationError("date_format should be presented if you use date features")
|
|
165
|
+
request["dateFeatures"] = date_features
|
|
166
|
+
if date_vector_features is not None:
|
|
167
|
+
if date_format is None:
|
|
168
|
+
raise ValidationError("date_format should be presented if you use date vector features")
|
|
169
|
+
request["dateVectorFeatures"] = date_vector_features
|
|
170
|
+
if generate_runtime_embeddings is not None:
|
|
171
|
+
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
172
|
+
if exclude_raw is not None:
|
|
173
|
+
request["excludeRaw"] = exclude_raw
|
|
119
174
|
self.logger.info(f"Start registering data table {request}")
|
|
120
175
|
|
|
121
176
|
task_id = self._rest_client.register_ads(request, trace_id)
|
|
@@ -173,6 +228,9 @@ class DataSourcePublisher:
|
|
|
173
228
|
msg = f"Data table successfully registered with id: {data_table_id}"
|
|
174
229
|
self.logger.info(msg)
|
|
175
230
|
print(msg)
|
|
231
|
+
if "warnings" in status_response and status_response["warnings"]:
|
|
232
|
+
self.logger.warning(status_response["warnings"])
|
|
233
|
+
print(status_response["warnings"])
|
|
176
234
|
return data_table_id
|
|
177
235
|
except KeyboardInterrupt:
|
|
178
236
|
if task_id is not None:
|
|
@@ -185,11 +243,17 @@ class DataSourcePublisher:
|
|
|
185
243
|
self.logger.exception("Failed to register data table")
|
|
186
244
|
raise
|
|
187
245
|
|
|
188
|
-
def remove(self, data_table_ids: List[str]):
|
|
246
|
+
def remove(self, data_table_ids: Union[List[str], str]):
|
|
189
247
|
trace_id = str(uuid.uuid4())
|
|
190
248
|
with MDC(trace_id=trace_id):
|
|
191
249
|
try:
|
|
192
|
-
if data_table_ids is None or len(data_table_ids) == 0:
|
|
250
|
+
if not data_table_ids:
|
|
251
|
+
raise ValidationError("Empty data table ids")
|
|
252
|
+
if isinstance(data_table_ids, str):
|
|
253
|
+
data_table_ids = [data_table_ids]
|
|
254
|
+
if not isinstance(data_table_ids, list):
|
|
255
|
+
raise ValidationError("Invalid format of data_table_ids argument")
|
|
256
|
+
if len(data_table_ids) == 0:
|
|
193
257
|
raise ValidationError("Empty data table ids")
|
|
194
258
|
|
|
195
259
|
for data_table_id in data_table_ids:
|
|
@@ -218,16 +282,23 @@ class DataSourcePublisher:
|
|
|
218
282
|
source_link: Optional[str] = None,
|
|
219
283
|
update_frequency: Optional[str] = None,
|
|
220
284
|
client_emails: Optional[List[str]] = None,
|
|
285
|
+
date_features: Optional[List[str]] = None,
|
|
286
|
+
date_vector_features: Optional[List[str]] = None,
|
|
287
|
+
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
288
|
+
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
289
|
+
exclude_raw: Optional[List[str]] = None,
|
|
221
290
|
):
|
|
222
291
|
trace_id = str(uuid.uuid4())
|
|
223
292
|
with MDC(trace_id=trace_id):
|
|
224
293
|
try:
|
|
225
|
-
if data_table_ids is None or len(data_table_ids) == 0:
|
|
294
|
+
if data_table_ids is None:
|
|
226
295
|
raise ValidationError("Empty data table ids")
|
|
227
296
|
if isinstance(data_table_ids, str):
|
|
228
297
|
data_table_ids = [data_table_ids]
|
|
229
298
|
if not isinstance(data_table_ids, list):
|
|
230
299
|
raise ValidationError("data_table_ids should be string or list of strings")
|
|
300
|
+
if len(data_table_ids) == 0:
|
|
301
|
+
raise ValidationError("Empty data table ids")
|
|
231
302
|
if update_frequency is not None and update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
|
|
232
303
|
raise ValidationError(
|
|
233
304
|
f"Invalid update frequency: {update_frequency}. "
|
|
@@ -263,6 +334,16 @@ class DataSourcePublisher:
|
|
|
263
334
|
request["updateFrequency"] = update_frequency
|
|
264
335
|
if client_emails is not None:
|
|
265
336
|
request["clientEmails"] = client_emails
|
|
337
|
+
if date_features is not None:
|
|
338
|
+
request["dateFeatures"] = date_features
|
|
339
|
+
if date_vector_features is not None:
|
|
340
|
+
request["dateVectorFeatures"] = date_vector_features
|
|
341
|
+
if exclude_from_autofe_generation is not None:
|
|
342
|
+
request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
|
|
343
|
+
if generate_runtime_embeddings is not None:
|
|
344
|
+
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
345
|
+
if exclude_raw is not None:
|
|
346
|
+
request["excludeRaw"] = exclude_raw
|
|
266
347
|
self.logger.info(f"Activating data tables with request {request}")
|
|
267
348
|
|
|
268
349
|
self._rest_client.activate_datatables(request, trace_id)
|
|
@@ -344,3 +425,45 @@ class DataSourcePublisher:
|
|
|
344
425
|
self.upload_online(search_keys=keys)
|
|
345
426
|
|
|
346
427
|
print("All ADS-es successfully uploaded")
|
|
428
|
+
|
|
429
|
+
def union_search_tasks(
|
|
430
|
+
self,
|
|
431
|
+
search_ids: List[str],
|
|
432
|
+
target_user_email: str,
|
|
433
|
+
selected_features: Optional[List[str]] = None,
|
|
434
|
+
exclude_features: Optional[List[str]] = None,
|
|
435
|
+
) -> str:
|
|
436
|
+
if not search_ids:
|
|
437
|
+
raise Exception("Empty search ids list")
|
|
438
|
+
|
|
439
|
+
if not target_user_email:
|
|
440
|
+
raise Exception("Empty target user email")
|
|
441
|
+
|
|
442
|
+
request = {
|
|
443
|
+
"search_task_ids": search_ids,
|
|
444
|
+
"target_user_email": target_user_email,
|
|
445
|
+
}
|
|
446
|
+
if selected_features:
|
|
447
|
+
request["selected_features"] = selected_features
|
|
448
|
+
if exclude_features:
|
|
449
|
+
request["exclude_features"] = exclude_features
|
|
450
|
+
|
|
451
|
+
response = self._rest_client.union_search_tasks(request, "trace_id")
|
|
452
|
+
print(response)
|
|
453
|
+
return response
|
|
454
|
+
|
|
455
|
+
def reannounce_all_ads(self):
|
|
456
|
+
trace_id = str(uuid.uuid4())
|
|
457
|
+
with MDC(trace_id=trace_id):
|
|
458
|
+
try:
|
|
459
|
+
task_id = self._rest_client.reannounce_all_ads(trace_id)
|
|
460
|
+
with Spinner():
|
|
461
|
+
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
462
|
+
while status_response["status"] not in self.FINAL_STATUSES:
|
|
463
|
+
time.sleep(5)
|
|
464
|
+
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
465
|
+
|
|
466
|
+
if status_response["status"] != "COMPLETED":
|
|
467
|
+
raise Exception("Failed to reannounce all ADS: " + status_response["errorMessage"])
|
|
468
|
+
except Exception:
|
|
469
|
+
self.logger.exception("Failed to reannounce all ADS-es")
|