upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

Files changed (43)
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a1.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/http.py CHANGED
@@ -39,18 +39,6 @@ from upgini.metadata import (
39
39
  from upgini.resource_bundle import bundle
40
40
  from upgini.utils.track_info import get_track_metrics
41
41
 
42
- # try:
43
- # from importlib.metadata import version # type: ignore
44
-
45
- # __version__ = version("upgini")
46
- # except ImportError:
47
- # try:
48
- # from importlib_metadata import version # type: ignore
49
-
50
- # __version__ = version("upgini")
51
- # except ImportError:
52
- # __version__ = "Upgini wasn't installed"
53
-
54
42
  UPGINI_URL: str = "UPGINI_URL"
55
43
  UPGINI_API_KEY: str = "UPGINI_API_KEY"
56
44
  DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
@@ -285,12 +273,14 @@ class _RestClient:
285
273
  DEACTIVATE_ADS_URI = "private/api/v2/ads/deactivate"
286
274
  TOGGLE_ADS_URI_FMT = "private/api/v2/ads/{0}/toggle"
287
275
  DELETE_ADS_URI_FMT = "private/api/v2/ads/{0}"
276
+ REANNOUNCE_ALL_ADS_URI_FMT = "private/api/v2/ads/reannounce-all-ads"
288
277
  POLL_ADS_MANAGEMENT_STATUS_URI_FMT = "private/api/v2/ads/management-task/{0}"
289
278
  GET_ADS_DESCRIPTION_URI_FMT = "private/api/v2/ads/{0}"
290
279
  GET_ALL_ADS_DESCRIPTIONS_URI = "private/api/v2/ads/descriptions"
291
280
  GET_ACTIVE_ADS_DEFINITIONS_URI = "private/api/v2/ads/definitions"
292
281
  UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
293
282
  STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"
283
+ UNION_SEARCH_TASKS_URI_FMT = SERVICE_ROOT_V2 + "search/merge"
294
284
 
295
285
  ACCESS_TOKEN_HEADER_NAME = "Authorization"
296
286
  CONTENT_TYPE_HEADER_NAME = "Content-Type"
@@ -469,7 +459,11 @@ class _RestClient:
469
459
  dumps(track_metrics).encode(),
470
460
  "application/json",
471
461
  ),
472
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
462
+ "metrics": (
463
+ "metrics.json",
464
+ metrics.json(exclude_none=True).encode(),
465
+ "application/json",
466
+ ),
473
467
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
474
468
  }
475
469
  if search_customization is not None:
@@ -553,7 +547,11 @@ class _RestClient:
553
547
  dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
554
548
  "application/json",
555
549
  ),
556
- "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
550
+ "metrics": (
551
+ "metrics.json",
552
+ metrics.json(exclude_none=True).encode(),
553
+ "application/json",
554
+ ),
557
555
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
558
556
  }
559
557
  if search_customization is not None:
@@ -649,7 +647,11 @@ class _RestClient:
649
647
  with open(file_path, "rb") as file:
650
648
  files = {
651
649
  "file": (metadata.name, file, "application/octet-stream"),
652
- "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
650
+ "metadata": (
651
+ "metadata.json",
652
+ metadata.json(exclude_none=True).encode(),
653
+ "application/json",
654
+ ),
653
655
  }
654
656
 
655
657
  return self._send_post_file_req_v2(api_path, files)
@@ -725,6 +727,11 @@ class _RestClient:
725
727
  api_path = self.DEACTIVATE_ADS_URI
726
728
  self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
727
729
 
730
+ def reannounce_all_ads(self, trace_id: str) -> str:
731
+ api_path = self.REANNOUNCE_ALL_ADS_URI_FMT
732
+ response = self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
733
+ return response["adsManagementTaskId"]
734
+
728
735
  def toggle_ads(self, ads_definition_id: str, trace_id: str):
729
736
  api_path = self.TOGGLE_ADS_URI_FMT.format(ads_definition_id)
730
737
  return self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
@@ -760,6 +767,10 @@ class _RestClient:
760
767
  api_path = self.STOP_ADS_MANAGEMENT_TASK_URI_FMT.format(ads_management_task_id)
761
768
  self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
762
769
 
770
+ def union_search_tasks(self, request: dict, trace_id: str):
771
+ api_path = self.UNION_SEARCH_TASKS_URI_FMT
772
+ return self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
773
+
763
774
  # ---
764
775
 
765
776
  def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
@@ -871,7 +882,7 @@ class _RestClient:
871
882
  if content_type:
872
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
873
884
  if trace_id:
874
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
875
886
  for header_key, header_value in additional_headers.items():
876
887
  headers[header_key] = header_value
877
888
  return headers
upgini/lazy_import.py ADDED
@@ -0,0 +1,35 @@
1
+ import importlib
2
+ import importlib.util
3
+ import importlib.machinery
4
+
5
+
6
class LazyImport:
    """Callable proxy that defers importing ``module_name`` until first use.

    Nothing is imported when the proxy is constructed. The first call or
    attribute access imports the module, looks up ``class_name`` on it and
    caches both on the instance. Calling the proxy forwards the arguments to
    the resolved object; attribute access is forwarded to the resolved object
    as well.
    """

    def __init__(self, module_name, class_name):
        self.module_name = module_name
        self.class_name = class_name
        self._module = None
        self._class = None

    def _load(self):
        """Import the module and cache the target attribute (no-op once loaded).

        Raises:
            ImportError: if ``module_name`` cannot be found.
            AttributeError: if the module has no attribute ``class_name``.
        """
        if self._class is not None:
            return

        # import_module goes through sys.modules, so the module is executed at
        # most once per process and the resolved class is the same object that
        # a regular ``import`` statement would return.
        try:
            module = importlib.import_module(self.module_name)
        except ModuleNotFoundError as err:
            if err.name != self.module_name:
                # A parent package or a dependency of the module is missing;
                # keep that error untouched so it names the real culprit.
                raise
            raise ImportError(f"Module {self.module_name} not found") from err

        target = getattr(module, self.class_name)

        # Publish the cache only after everything succeeded, so a failed
        # attempt is retried on the next use instead of leaving a half-loaded
        # proxy behind.
        self._module = module
        self._class = target

    def __call__(self, *args, **kwargs):
        self._load()
        return self._class(*args, **kwargs)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. The proxy's own
        # fields can be missing only if __init__ never ran; delegating in that
        # case would re-enter _load() and recurse without bound.
        if name in ("module_name", "class_name", "_module", "_class"):
            raise AttributeError(name)
        self._load()
        return getattr(self._class, name)
upgini/metadata.py CHANGED
@@ -1,11 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Dict, List, Optional, Set
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
8
8
  SYSTEM_RECORD_ID = "system_record_id"
9
+ ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
10
+ SEARCH_KEY_UNNEST = "search_key_unnest"
9
11
  SORT_ID = "sort_id"
10
12
  EVAL_SET_INDEX = "eval_set_index"
11
13
  TARGET = "target"
@@ -13,7 +15,7 @@ COUNTRY = "country_iso_code"
13
15
  RENAMED_INDEX = "index_col"
14
16
  DEFAULT_INDEX = "index"
15
17
  ORIGINAL_INDEX = "original_index"
16
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
18
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
17
19
 
18
20
 
19
21
  class FileColumnMeaningType(Enum):
@@ -39,6 +41,8 @@ class FileColumnMeaningType(Enum):
39
41
  POSTAL_CODE = "POSTAL_CODE"
40
42
  SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
41
43
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
+ ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
+ UNNEST_KEY = "UNNEST_KEY"
42
46
 
43
47
 
44
48
  class SearchKey(Enum):
@@ -109,6 +113,21 @@ class SearchKey(Enum):
109
113
  if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
110
114
  return SearchKey.MSISDN_RANGE_TO
111
115
 
116
@staticmethod
def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
    """Return the first column whose search key type is one of ``keys``.

    Args:
        search_keys: mapping of column name to its search key type.
        keys: a single search key type or a list of acceptable types.

    Returns:
        The name of the first matching column in the iteration order of
        ``search_keys``, or ``None`` when no column matches.
    """
    if isinstance(keys, SearchKey):
        keys = [keys]
    # The result is the dict key (a column name), not the SearchKey value.
    return next((col for col, key_type in search_keys.items() if key_type in keys), None)
124
+
125
@staticmethod
def find_all_keys(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> List[str]:
    """Return every column whose search key type is one of ``keys``.

    Args:
        search_keys: mapping of column name to its search key type.
        keys: a single search key type or a list of acceptable types.

    Returns:
        Matching column names in the iteration order of ``search_keys``;
        an empty list when nothing matches.
    """
    if isinstance(keys, SearchKey):
        keys = [keys]
    # The result holds the dict keys (column names), not SearchKey values.
    return [col for col, key_type in search_keys.items() if key_type in keys]
130
+
112
131
 
113
132
  class DataType(Enum):
114
133
  INT = "INT"
@@ -153,23 +172,23 @@ class FileMetricsInterval(BaseModel):
153
172
  date_cut: float
154
173
  count: float
155
174
  valid_count: float
156
- avg_target: Optional[float] # not for multiclass
157
- avg_score_etalon: Optional[float]
175
+ avg_target: Optional[float] = None # not for multiclass
176
+ avg_score_etalon: Optional[float] = None
158
177
 
159
178
 
160
179
  class FileMetrics(BaseModel):
161
180
  # etalon metadata
162
- task_type: Optional[ModelTaskType]
163
- label: Optional[ModelLabelType]
164
- count: Optional[int]
165
- valid_count: Optional[int]
166
- valid_rate: Optional[float]
167
- avg_target: Optional[float]
168
- metrics_binary_etalon: Optional[BinaryTask]
169
- metrics_regression_etalon: Optional[RegressionTask]
170
- metrics_multiclass_etalon: Optional[MulticlassTask]
171
- cuts: Optional[List[float]]
172
- interval: Optional[List[FileMetricsInterval]]
181
+ task_type: Optional[ModelTaskType] = None
182
+ label: Optional[ModelLabelType] = None
183
+ count: Optional[int] = None
184
+ valid_count: Optional[int] = None
185
+ valid_rate: Optional[float] = None
186
+ avg_target: Optional[float] = None
187
+ metrics_binary_etalon: Optional[BinaryTask] = None
188
+ metrics_regression_etalon: Optional[RegressionTask] = None
189
+ metrics_multiclass_etalon: Optional[MulticlassTask] = None
190
+ cuts: Optional[List[float]] = None
191
+ interval: Optional[List[FileMetricsInterval]] = None
173
192
 
174
193
 
175
194
  class NumericInterval(BaseModel):
@@ -183,21 +202,25 @@ class FileColumnMetadata(BaseModel):
183
202
  dataType: DataType
184
203
  meaningType: FileColumnMeaningType
185
204
  minMaxValues: Optional[NumericInterval] = None
186
- originalName: Optional[str]
205
+ originalName: Optional[str] = None
206
+ # is this column contains keys from multiple key columns like msisdn1, msisdn2
207
+ isUnnest: bool = False
208
+ # list of original etalon key column names like msisdn1, msisdn2
209
+ unnestKeyNames: Optional[List[str]] = None
187
210
 
188
211
 
189
212
  class FileMetadata(BaseModel):
190
213
  name: str
191
- description: Optional[str]
214
+ description: Optional[str] = None
192
215
  columns: List[FileColumnMetadata]
193
216
  searchKeys: List[List[str]]
194
- excludeFeaturesSources: Optional[List[str]]
195
- hierarchicalGroupKeys: Optional[List[str]]
196
- hierarchicalSubgroupKeys: Optional[List[str]]
197
- taskType: Optional[ModelTaskType]
198
- rowsCount: Optional[int]
199
- checksumMD5: Optional[str]
200
- digest: Optional[str]
217
+ excludeFeaturesSources: Optional[List[str]] = None
218
+ hierarchicalGroupKeys: Optional[List[str]] = None
219
+ hierarchicalSubgroupKeys: Optional[List[str]] = None
220
+ taskType: Optional[ModelTaskType] = None
221
+ rowsCount: Optional[int] = None
222
+ checksumMD5: Optional[str] = None
223
+ digest: Optional[str] = None
201
224
 
202
225
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
203
226
  for c in self.columns:
@@ -205,13 +228,13 @@ class FileMetadata(BaseModel):
205
228
  return c
206
229
  return None
207
230
 
208
- def search_types(self) -> Set[SearchKey]:
209
- search_keys = set()
231
+ def search_types(self) -> Dict[SearchKey, str]:
232
+ search_keys = dict()
210
233
  for keys_group in self.searchKeys:
211
234
  for key in keys_group:
212
235
  column = self.column_by_name(key)
213
236
  if column:
214
- search_keys.add(SearchKey.from_meaning_type(column.meaningType))
237
+ search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
215
238
  return search_keys
216
239
 
217
240
 
@@ -221,17 +244,17 @@ class FeaturesMetadataV2(BaseModel):
221
244
  source: str
222
245
  hit_rate: float
223
246
  shap_value: float
224
- commercial_schema: Optional[str]
225
- data_provider: Optional[str]
226
- data_providers: Optional[List[str]]
227
- data_provider_link: Optional[str]
228
- data_provider_links: Optional[List[str]]
229
- data_source: Optional[str]
230
- data_sources: Optional[List[str]]
231
- data_source_link: Optional[str]
232
- data_source_links: Optional[List[str]]
233
- doc_link: Optional[str]
234
- update_frequency: Optional[str]
247
+ commercial_schema: Optional[str] = None
248
+ data_provider: Optional[str] = None
249
+ data_providers: Optional[List[str]] = None
250
+ data_provider_link: Optional[str] = None
251
+ data_provider_links: Optional[List[str]] = None
252
+ data_source: Optional[str] = None
253
+ data_sources: Optional[List[str]] = None
254
+ data_source_link: Optional[str] = None
255
+ data_source_links: Optional[List[str]] = None
256
+ doc_link: Optional[str] = None
257
+ update_frequency: Optional[str] = None
235
258
 
236
259
 
237
260
  class HitRateMetrics(BaseModel):
@@ -251,46 +274,48 @@ class ModelEvalSet(BaseModel):
251
274
  class BaseColumnMetadata(BaseModel):
252
275
  original_name: str
253
276
  hashed_name: str
254
- ads_definition_id: Optional[str]
277
+ ads_definition_id: Optional[str] = None
255
278
  is_augmented: bool
256
279
 
257
280
 
258
281
  class GeneratedFeatureMetadata(BaseModel):
259
- formula: str # on hashed names
282
+ alias: Optional[str] = None
283
+ formula: str
260
284
  display_index: str
261
285
  base_columns: List[BaseColumnMetadata]
286
+ operator_params: Optional[Dict[str, str]] = None
262
287
 
263
288
 
264
289
  class ProviderTaskMetadataV2(BaseModel):
265
290
  features: List[FeaturesMetadataV2]
266
- hit_rate_metrics: Optional[HitRateMetrics]
267
- eval_set_metrics: Optional[List[ModelEvalSet]]
268
- zero_hit_rate_search_keys: Optional[List[str]]
269
- features_used_for_embeddings: Optional[List[str]]
270
- shuffle_kfold: Optional[bool]
271
- generated_features: Optional[List[GeneratedFeatureMetadata]]
291
+ hit_rate_metrics: Optional[HitRateMetrics] = None
292
+ eval_set_metrics: Optional[List[ModelEvalSet]] = None
293
+ zero_hit_rate_search_keys: Optional[List[str]] = None
294
+ features_used_for_embeddings: Optional[List[str]] = None
295
+ shuffle_kfold: Optional[bool] = None
296
+ generated_features: Optional[List[GeneratedFeatureMetadata]] = None
272
297
 
273
298
 
274
299
  class FeaturesFilter(BaseModel):
275
- minImportance: Optional[float]
276
- maxPSI: Optional[float]
277
- maxCount: Optional[int]
278
- selectedFeatures: Optional[List[str]]
300
+ minImportance: Optional[float] = None
301
+ maxPSI: Optional[float] = None
302
+ maxCount: Optional[int] = None
303
+ selectedFeatures: Optional[List[str]] = None
279
304
 
280
305
 
281
306
  class RuntimeParameters(BaseModel):
282
- properties: Dict[str, str] = dict()
307
+ properties: Dict[str, Any] = {}
283
308
 
284
309
 
285
310
  class SearchCustomization(BaseModel):
286
- featuresFilter: Optional[FeaturesFilter]
287
- extractFeatures: Optional[bool]
288
- accurateModel: Optional[bool]
289
- importanceThreshold: Optional[float]
290
- maxFeatures: Optional[int]
291
- returnScores: Optional[bool]
292
- runtimeParameters: Optional[RuntimeParameters]
293
- metricsCalculation: Optional[bool]
311
+ featuresFilter: Optional[FeaturesFilter] = None
312
+ extractFeatures: Optional[bool] = None
313
+ accurateModel: Optional[bool] = None
314
+ importanceThreshold: Optional[float] = None
315
+ maxFeatures: Optional[int] = None
316
+ returnScores: Optional[bool] = None
317
+ runtimeParameters: Optional[RuntimeParameters] = None
318
+ metricsCalculation: Optional[bool] = None
294
319
 
295
320
  def __repr__(self):
296
321
  return (