upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic (the link to further details was not preserved in this text export).

Files changed (43)
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a2.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a2.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a2.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a2.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@ import time
3
3
  import uuid
4
4
  from datetime import datetime
5
5
  from enum import Enum
6
- from typing import Dict, List, Optional, Union
6
+ from typing import Dict, List, Literal, Optional, Union
7
7
 
8
8
  from upgini.errors import HttpError, ValidationError
9
9
  from upgini.http import LoggerFactory, get_rest_client
@@ -47,7 +47,9 @@ class DataSourcePublisher:
47
47
  self,
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
- update_frequency: str,
50
+ update_frequency: Union[
51
+ Literal["Daily"], Literal["Weekly"], Literal["Monthly"], Literal["Quarterly"], Literal["Annually"]
52
+ ],
51
53
  exclude_from_autofe_generation: Optional[List[str]],
52
54
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
53
55
  sort_column: Optional[str] = None,
@@ -58,9 +60,43 @@ class DataSourcePublisher:
58
60
  join_date_abs_limit_days: Optional[int] = None,
59
61
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
60
62
  data_table_id_to_replace: Optional[str] = None,
63
+ keep_features: Optional[List[str]] = None,
64
+ date_features: Optional[List[str]] = None,
65
+ date_vector_features: Optional[List[str]] = None,
66
+ generate_runtime_embeddings: Optional[List[str]] = None,
67
+ exclude_raw: Optional[List[str]] = None,
61
68
  _force_generation=False,
62
69
  _silent=False,
63
70
  ) -> str:
71
+ """Register new ADS
72
+
73
+ Parameters
74
+ ----------
75
+ data_table_uri - str - table name in format {project_id}.{datasource_name}.{table_name}
76
+
77
+ search_keys - dict with column names as keys and SearchKey as value
78
+
79
+ update_frequency - str - (Monthly, Weekly, Daily, Annually, Quarterly)
80
+
81
+ exclude_from_autofe_generation - optional list of features that should be excluded from AutoFE
82
+
83
+ secondary_search_keys - optional dict of secondary search keys
84
+
85
+ sort_column - optional str - name of unique column that could be used for sort
86
+
87
+ date_format - optional str - format of date if it is present in search keys
88
+
89
+ features_for_embeddings - optional list of str - list of features that should be used for GPT features
90
+ generation
91
+
92
+ exclude_raw - optional list of str - list of features that should NOT be used as raw features
93
+
94
+ ...
95
+
96
+ data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
97
+
98
+ keep_features - optional list - features that should not be removed from ADS (even if they are personal)
99
+ """
64
100
  trace_id = str(uuid.uuid4())
65
101
 
66
102
  with MDC(trace_id=trace_id):
@@ -94,6 +130,11 @@ class DataSourcePublisher:
94
130
  "With MSISDN and DATE keys one of the snapshot_frequency_days or"
95
131
  " join_date_abs_limit_days parameters is required"
96
132
  )
133
+ if (
134
+ set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
135
+ or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
136
+ ) and not date_format:
137
+ raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
97
138
 
98
139
  request = {
99
140
  "dataTableUri": data_table_uri,
@@ -116,6 +157,20 @@ class DataSourcePublisher:
116
157
  request["adsDefinitionIdToReplace"] = data_table_id_to_replace
117
158
  if exclude_from_autofe_generation is not None:
118
159
  request["excludeFromGeneration"] = exclude_from_autofe_generation
160
+ if keep_features is not None:
161
+ request["keepFeatures"] = keep_features
162
+ if date_features is not None:
163
+ if date_format is None:
164
+ raise ValidationError("date_format should be presented if you use date features")
165
+ request["dateFeatures"] = date_features
166
+ if date_vector_features is not None:
167
+ if date_format is None:
168
+ raise ValidationError("date_format should be presented if you use date vector features")
169
+ request["dateVectorFeatures"] = date_vector_features
170
+ if generate_runtime_embeddings is not None:
171
+ request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
172
+ if exclude_raw is not None:
173
+ request["excludeRaw"] = exclude_raw
119
174
  self.logger.info(f"Start registering data table {request}")
120
175
 
121
176
  task_id = self._rest_client.register_ads(request, trace_id)
@@ -173,6 +228,9 @@ class DataSourcePublisher:
173
228
  msg = f"Data table successfully registered with id: {data_table_id}"
174
229
  self.logger.info(msg)
175
230
  print(msg)
231
+ if "warnings" in status_response and status_response["warnings"]:
232
+ self.logger.warning(status_response["warnings"])
233
+ print(status_response["warnings"])
176
234
  return data_table_id
177
235
  except KeyboardInterrupt:
178
236
  if task_id is not None:
@@ -185,11 +243,17 @@ class DataSourcePublisher:
185
243
  self.logger.exception("Failed to register data table")
186
244
  raise
187
245
 
188
- def remove(self, data_table_ids: List[str]):
246
+ def remove(self, data_table_ids: Union[List[str], str]):
189
247
  trace_id = str(uuid.uuid4())
190
248
  with MDC(trace_id=trace_id):
191
249
  try:
192
- if data_table_ids is None or len(data_table_ids) == 0:
250
+ if not data_table_ids:
251
+ raise ValidationError("Empty data table ids")
252
+ if isinstance(data_table_ids, str):
253
+ data_table_ids = [data_table_ids]
254
+ if not isinstance(data_table_ids, list):
255
+ raise ValidationError("Invalid format of data_table_ids argument")
256
+ if len(data_table_ids) == 0:
193
257
  raise ValidationError("Empty data table ids")
194
258
 
195
259
  for data_table_id in data_table_ids:
@@ -218,16 +282,23 @@ class DataSourcePublisher:
218
282
  source_link: Optional[str] = None,
219
283
  update_frequency: Optional[str] = None,
220
284
  client_emails: Optional[List[str]] = None,
285
+ date_features: Optional[List[str]] = None,
286
+ date_vector_features: Optional[List[str]] = None,
287
+ exclude_from_autofe_generation: Optional[List[str]] = None,
288
+ generate_runtime_embeddings: Optional[List[str]] = None,
289
+ exclude_raw: Optional[List[str]] = None,
221
290
  ):
222
291
  trace_id = str(uuid.uuid4())
223
292
  with MDC(trace_id=trace_id):
224
293
  try:
225
- if data_table_ids is None or len(data_table_ids) == 0:
294
+ if data_table_ids is None:
226
295
  raise ValidationError("Empty data table ids")
227
296
  if isinstance(data_table_ids, str):
228
297
  data_table_ids = [data_table_ids]
229
298
  if not isinstance(data_table_ids, list):
230
299
  raise ValidationError("data_table_ids should be string or list of strings")
300
+ if len(data_table_ids) == 0:
301
+ raise ValidationError("Empty data table ids")
231
302
  if update_frequency is not None and update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
232
303
  raise ValidationError(
233
304
  f"Invalid update frequency: {update_frequency}. "
@@ -263,6 +334,16 @@ class DataSourcePublisher:
263
334
  request["updateFrequency"] = update_frequency
264
335
  if client_emails is not None:
265
336
  request["clientEmails"] = client_emails
337
+ if date_features is not None:
338
+ request["dateFeatures"] = date_features
339
+ if date_vector_features is not None:
340
+ request["dateVectorFeatures"] = date_vector_features
341
+ if exclude_from_autofe_generation is not None:
342
+ request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
343
+ if generate_runtime_embeddings is not None:
344
+ request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
345
+ if exclude_raw is not None:
346
+ request["excludeRaw"] = exclude_raw
266
347
  self.logger.info(f"Activating data tables with request {request}")
267
348
 
268
349
  self._rest_client.activate_datatables(request, trace_id)
@@ -344,3 +425,45 @@ class DataSourcePublisher:
344
425
  self.upload_online(search_keys=keys)
345
426
 
346
427
  print("All ADS-es successfully uploaded")
428
+
429
+ def union_search_tasks(
430
+ self,
431
+ search_ids: List[str],
432
+ target_user_email: str,
433
+ selected_features: Optional[List[str]] = None,
434
+ exclude_features: Optional[List[str]] = None,
435
+ ) -> str:
436
+ if not search_ids:
437
+ raise Exception("Empty search ids list")
438
+
439
+ if not target_user_email:
440
+ raise Exception("Empty target user email")
441
+
442
+ request = {
443
+ "search_task_ids": search_ids,
444
+ "target_user_email": target_user_email,
445
+ }
446
+ if selected_features:
447
+ request["selected_features"] = selected_features
448
+ if exclude_features:
449
+ request["exclude_features"] = exclude_features
450
+
451
+ response = self._rest_client.union_search_tasks(request, "trace_id")
452
+ print(response)
453
+ return response
454
+
455
+ def reannounce_all_ads(self):
456
+ trace_id = str(uuid.uuid4())
457
+ with MDC(trace_id=trace_id):
458
+ try:
459
+ task_id = self._rest_client.reannounce_all_ads(trace_id)
460
+ with Spinner():
461
+ status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
462
+ while status_response["status"] not in self.FINAL_STATUSES:
463
+ time.sleep(5)
464
+ status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
465
+
466
+ if status_response["status"] != "COMPLETED":
467
+ raise Exception("Failed to reannounce all ADS: " + status_response["errorMessage"])
468
+ except Exception:
469
+ self.logger.exception("Failed to reannounce all ADS-es")