upgini 1.2.106a3956.dev1__py3-none-any.whl → 1.2.108__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +76 -53
- upgini/http.py +3 -3
- upgini/metadata.py +9 -0
- {upgini-1.2.106a3956.dev1.dist-info → upgini-1.2.108.dist-info}/METADATA +1 -1
- {upgini-1.2.106a3956.dev1.dist-info → upgini-1.2.108.dist-info}/RECORD +8 -8
- {upgini-1.2.106a3956.dev1.dist-info → upgini-1.2.108.dist-info}/WHEEL +1 -1
- {upgini-1.2.106a3956.dev1.dist-info → upgini-1.2.108.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.106a3956.dev1"
|
1
|
+
__version__ = "1.2.108"
|
upgini/features_enricher.py
CHANGED
@@ -208,7 +208,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
208
208
|
self,
|
209
209
|
search_keys: Optional[Dict[str, SearchKey]] = None,
|
210
210
|
country_code: Optional[str] = None,
|
211
|
-
model_task_type: Optional[ModelTaskType] = None,
|
211
|
+
model_task_type: Optional[Union[ModelTaskType, str]] = None,
|
212
212
|
api_key: Optional[str] = None,
|
213
213
|
endpoint: Optional[str] = None,
|
214
214
|
search_id: Optional[str] = None,
|
@@ -234,6 +234,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
234
234
|
id_columns: Optional[List[str]] = None,
|
235
235
|
generate_search_key_features: bool = True,
|
236
236
|
sample_config: Optional[SampleConfig] = None,
|
237
|
+
print_trace_id: bool = False,
|
237
238
|
**kwargs,
|
238
239
|
):
|
239
240
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
@@ -282,6 +283,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
282
283
|
self.country_code = country_code
|
283
284
|
self.__validate_search_keys(search_keys, search_id)
|
284
285
|
|
286
|
+
if model_task_type is not None:
|
287
|
+
self.model_task_type = ModelTaskType.parse(model_task_type)
|
285
288
|
self.model_task_type = model_task_type
|
286
289
|
self.endpoint = endpoint
|
287
290
|
self._search_task: Optional[SearchTask] = None
|
@@ -303,6 +306,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
303
306
|
|
304
307
|
print(self.bundle.get("search_by_task_id_start"))
|
305
308
|
trace_id = str(uuid.uuid4())
|
309
|
+
if self.print_trace_id:
|
310
|
+
print(f"@trace_id:{trace_id}")
|
306
311
|
with MDC(trace_id=trace_id):
|
307
312
|
try:
|
308
313
|
self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
|
@@ -366,6 +371,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
366
371
|
self.data_sources_display_handle = None
|
367
372
|
self.autofe_features_display_handle = None
|
368
373
|
self.report_button_handle = None
|
374
|
+
self.print_trace_id = print_trace_id
|
369
375
|
|
370
376
|
def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
|
371
377
|
sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
|
@@ -461,6 +467,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
461
467
|
Otherwise, return all features from input and only selected features from data sources.
|
462
468
|
"""
|
463
469
|
trace_id = str(uuid.uuid4())
|
470
|
+
if self.print_trace_id:
|
471
|
+
print(f"@trace_id:{trace_id}")
|
464
472
|
start_time = time.time()
|
465
473
|
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
466
474
|
search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
|
@@ -619,6 +627,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
619
627
|
self.warning_counter.reset()
|
620
628
|
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
621
629
|
trace_id = str(uuid.uuid4())
|
630
|
+
if self.print_trace_id:
|
631
|
+
print(f"@trace_id:{trace_id}")
|
622
632
|
start_time = time.time()
|
623
633
|
with MDC(trace_id=trace_id):
|
624
634
|
if len(args) > 0:
|
@@ -4633,65 +4643,78 @@ if response.status_code == 200:
|
|
4633
4643
|
eval_set: Union[Tuple, None] = None,
|
4634
4644
|
):
|
4635
4645
|
def dump_task(X_, y_, eval_set_):
|
4636
|
-
|
4637
|
-
|
4638
|
-
X_
|
4639
|
-
|
4640
|
-
|
4641
|
-
|
4642
|
-
|
4643
|
-
|
4644
|
-
self.
|
4645
|
-
else:
|
4646
|
-
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/x.parquet", "x.parquet")
|
4647
|
-
|
4648
|
-
if y_ is not None:
|
4649
|
-
if isinstance(y_, pd.Series):
|
4650
|
-
y_ = y_.to_frame()
|
4651
|
-
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4652
|
-
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4653
|
-
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4646
|
+
with MDC(trace_id=trace_id):
|
4647
|
+
try:
|
4648
|
+
if isinstance(X_, pd.Series):
|
4649
|
+
X_ = X_.to_frame()
|
4650
|
+
|
4651
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
4652
|
+
X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
|
4653
|
+
x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
|
4654
|
+
if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
|
4654
4655
|
self.logger.info(
|
4655
|
-
f"File
|
4656
|
+
f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
|
4656
4657
|
)
|
4657
4658
|
else:
|
4658
|
-
self.rest_client.dump_input_file(
|
4659
|
-
|
4660
|
-
|
4661
|
-
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4662
|
-
if isinstance(eval_x_, pd.Series):
|
4663
|
-
eval_x_ = eval_x_.to_frame()
|
4664
|
-
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4665
|
-
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4666
|
-
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4667
|
-
)
|
4668
|
-
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4669
|
-
self.logger.info(
|
4670
|
-
f"File eval_x_{idx}.parquet was already uploaded with"
|
4671
|
-
f" digest {eval_x_digest_sha256}, skipping"
|
4672
|
-
)
|
4673
|
-
else:
|
4674
|
-
self.rest_client.dump_input_file(
|
4675
|
-
trace_id, f"{tmp_dir}/eval_x_{idx}.parquet", f"eval_x_{idx}.parquet"
|
4676
|
-
)
|
4659
|
+
self.rest_client.dump_input_file(
|
4660
|
+
trace_id, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
|
4661
|
+
)
|
4677
4662
|
|
4678
|
-
|
4679
|
-
|
4680
|
-
|
4681
|
-
|
4682
|
-
|
4663
|
+
if y_ is not None:
|
4664
|
+
if isinstance(y_, pd.Series):
|
4665
|
+
y_ = y_.to_frame()
|
4666
|
+
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4667
|
+
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4668
|
+
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4669
|
+
self.logger.info(
|
4670
|
+
f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
|
4683
4671
|
)
|
4684
|
-
|
4685
|
-
|
4686
|
-
|
4687
|
-
|
4672
|
+
else:
|
4673
|
+
self.rest_client.dump_input_file(
|
4674
|
+
trace_id, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
|
4675
|
+
)
|
4676
|
+
|
4677
|
+
if eval_set_ is not None and len(eval_set_) > 0:
|
4678
|
+
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4679
|
+
if isinstance(eval_x_, pd.Series):
|
4680
|
+
eval_x_ = eval_x_.to_frame()
|
4681
|
+
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4682
|
+
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4683
|
+
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4688
4684
|
)
|
4689
|
-
|
4690
|
-
|
4691
|
-
|
4685
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4686
|
+
self.logger.info(
|
4687
|
+
f"File eval_x_{idx}.parquet was already uploaded with"
|
4688
|
+
f" digest {eval_x_digest_sha256}, skipping"
|
4689
|
+
)
|
4690
|
+
else:
|
4691
|
+
self.rest_client.dump_input_file(
|
4692
|
+
trace_id,
|
4693
|
+
f"{tmp_dir}/eval_x_{idx}.parquet",
|
4694
|
+
f"eval_x_{idx}.parquet",
|
4695
|
+
eval_x_digest_sha256,
|
4696
|
+
)
|
4697
|
+
|
4698
|
+
if isinstance(eval_y_, pd.Series):
|
4699
|
+
eval_y_ = eval_y_.to_frame()
|
4700
|
+
eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
|
4701
|
+
eval_y_digest_sha256 = self.rest_client.compute_file_digest(
|
4702
|
+
f"{tmp_dir}/eval_y_{idx}.parquet"
|
4692
4703
|
)
|
4693
|
-
|
4694
|
-
|
4704
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
|
4705
|
+
self.logger.info(
|
4706
|
+
f"File eval_y_{idx}.parquet was already uploaded"
|
4707
|
+
f" with digest {eval_y_digest_sha256}, skipping"
|
4708
|
+
)
|
4709
|
+
else:
|
4710
|
+
self.rest_client.dump_input_file(
|
4711
|
+
trace_id,
|
4712
|
+
f"{tmp_dir}/eval_y_{idx}.parquet",
|
4713
|
+
f"eval_y_{idx}.parquet",
|
4714
|
+
eval_y_digest_sha256,
|
4715
|
+
)
|
4716
|
+
except Exception:
|
4717
|
+
self.logger.warning("Failed to dump input files", exc_info=True)
|
4695
4718
|
|
4696
4719
|
try:
|
4697
4720
|
Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
|
upgini/http.py
CHANGED
@@ -274,7 +274,7 @@ class _RestClient:
|
|
274
274
|
SEARCH_FILE_METADATA_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/{0}/metadata"
|
275
275
|
SEARCH_TASK_METADATA_FMT_V3 = SERVICE_ROOT_V2 + "search/metadata-v2/{0}"
|
276
276
|
SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
|
277
|
-
SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file"
|
277
|
+
SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file?digest={0}"
|
278
278
|
TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
|
279
279
|
|
280
280
|
UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
|
@@ -406,8 +406,8 @@ class _RestClient:
|
|
406
406
|
meaning_types = [_RestClient.meaning_type_by_name(name, metadata) for name in search_key_names]
|
407
407
|
return [meaning_type.value for meaning_type in meaning_types if meaning_type is not None]
|
408
408
|
|
409
|
-
def dump_input_file(self, trace_id: str, path: str, file_name: str):
|
410
|
-
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
409
|
+
def dump_input_file(self, trace_id: str, path: str, file_name: str, digest: str):
|
410
|
+
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT.format(digest)
|
411
411
|
with open(path, "rb") as file:
|
412
412
|
files = {"file": (file_name, file, "application/octet-stream")}
|
413
413
|
self._with_unauth_retry(
|
upgini/metadata.py
CHANGED
@@ -162,6 +162,15 @@ class ModelTaskType(Enum):
|
|
162
162
|
def is_classification(self) -> bool:
|
163
163
|
return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
|
164
164
|
|
165
|
+
@staticmethod
|
166
|
+
def parse(task_type: Any) -> "ModelTaskType":
|
167
|
+
if isinstance(task_type, ModelTaskType):
|
168
|
+
return task_type
|
169
|
+
elif isinstance(task_type, str):
|
170
|
+
return ModelTaskType(task_type.upper())
|
171
|
+
else:
|
172
|
+
raise ValueError(f"Invalid task type: {task_type}")
|
173
|
+
|
165
174
|
|
166
175
|
class ModelLabelType(Enum):
|
167
176
|
GINI = "gini"
|
@@ -1,11 +1,11 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=MtpgyPilS-p0uCXLJRENxbcYk2BQX6y8kTyPV7OfGCU,24
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256=
|
8
|
-
upgini/metadata.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=BHZcpkUl7ncSdTgiMxk_a6oD1pLEkKPOm5wxebK8TsU,221609
|
7
|
+
upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
|
8
|
+
upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
|
9
9
|
upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.108.dist-info/METADATA,sha256=O9XA7uFUs-bvKrspv0HZi6vD_v8ZLSVckmDW5N5CeRo,49529
|
75
|
+
upgini-1.2.108.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
+
upgini-1.2.108.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.108.dist-info/RECORD,,
|
File without changes
|