upgini 1.2.107__py3-none-any.whl → 1.2.108__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +73 -52
- upgini/http.py +3 -3
- {upgini-1.2.107.dist-info → upgini-1.2.108.dist-info}/METADATA +1 -1
- {upgini-1.2.107.dist-info → upgini-1.2.108.dist-info}/RECORD +7 -7
- {upgini-1.2.107.dist-info → upgini-1.2.108.dist-info}/WHEEL +0 -0
- {upgini-1.2.107.dist-info → upgini-1.2.108.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.107"
|
1
|
+
__version__ = "1.2.108"
|
upgini/features_enricher.py
CHANGED
@@ -234,6 +234,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
234
234
|
id_columns: Optional[List[str]] = None,
|
235
235
|
generate_search_key_features: bool = True,
|
236
236
|
sample_config: Optional[SampleConfig] = None,
|
237
|
+
print_trace_id: bool = False,
|
237
238
|
**kwargs,
|
238
239
|
):
|
239
240
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
@@ -305,6 +306,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
305
306
|
|
306
307
|
print(self.bundle.get("search_by_task_id_start"))
|
307
308
|
trace_id = str(uuid.uuid4())
|
309
|
+
if self.print_trace_id:
|
310
|
+
print(f"@trace_id:{trace_id}")
|
308
311
|
with MDC(trace_id=trace_id):
|
309
312
|
try:
|
310
313
|
self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
|
@@ -368,6 +371,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
368
371
|
self.data_sources_display_handle = None
|
369
372
|
self.autofe_features_display_handle = None
|
370
373
|
self.report_button_handle = None
|
374
|
+
self.print_trace_id = print_trace_id
|
371
375
|
|
372
376
|
def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
|
373
377
|
sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
|
@@ -463,6 +467,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
463
467
|
Otherwise, return all features from input and only selected features from data sources.
|
464
468
|
"""
|
465
469
|
trace_id = str(uuid.uuid4())
|
470
|
+
if self.print_trace_id:
|
471
|
+
print(f"@trace_id:{trace_id}")
|
466
472
|
start_time = time.time()
|
467
473
|
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
468
474
|
search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
|
@@ -621,6 +627,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
621
627
|
self.warning_counter.reset()
|
622
628
|
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
623
629
|
trace_id = str(uuid.uuid4())
|
630
|
+
if self.print_trace_id:
|
631
|
+
print(f"@trace_id:{trace_id}")
|
624
632
|
start_time = time.time()
|
625
633
|
with MDC(trace_id=trace_id):
|
626
634
|
if len(args) > 0:
|
@@ -4635,65 +4643,78 @@ if response.status_code == 200:
|
|
4635
4643
|
eval_set: Union[Tuple, None] = None,
|
4636
4644
|
):
|
4637
4645
|
def dump_task(X_, y_, eval_set_):
|
4638
|
-
|
4639
|
-
|
4640
|
-
X_
|
4641
|
-
|
4642
|
-
|
4643
|
-
|
4644
|
-
|
4645
|
-
|
4646
|
-
self.
|
4647
|
-
else:
|
4648
|
-
self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/x.parquet", "x.parquet")
|
4649
|
-
|
4650
|
-
if y_ is not None:
|
4651
|
-
if isinstance(y_, pd.Series):
|
4652
|
-
y_ = y_.to_frame()
|
4653
|
-
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4654
|
-
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4655
|
-
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4646
|
+
with MDC(trace_id=trace_id):
|
4647
|
+
try:
|
4648
|
+
if isinstance(X_, pd.Series):
|
4649
|
+
X_ = X_.to_frame()
|
4650
|
+
|
4651
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
4652
|
+
X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
|
4653
|
+
x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
|
4654
|
+
if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
|
4656
4655
|
self.logger.info(
|
4657
|
-
f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
|
4656
|
+
f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
|
4658
4657
|
)
|
4659
4658
|
else:
|
4660
|
-
self.rest_client.dump_input_file(
|
4661
|
-
|
4662
|
-
|
4663
|
-
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4664
|
-
if isinstance(eval_x_, pd.Series):
|
4665
|
-
eval_x_ = eval_x_.to_frame()
|
4666
|
-
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4667
|
-
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4668
|
-
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4669
|
-
)
|
4670
|
-
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4671
|
-
self.logger.info(
|
4672
|
-
f"File eval_x_{idx}.parquet was already uploaded with"
|
4673
|
-
f" digest {eval_x_digest_sha256}, skipping"
|
4674
|
-
)
|
4675
|
-
else:
|
4676
|
-
self.rest_client.dump_input_file(
|
4677
|
-
trace_id, f"{tmp_dir}/eval_x_{idx}.parquet", f"eval_x_{idx}.parquet"
|
4678
|
-
)
|
4659
|
+
self.rest_client.dump_input_file(
|
4660
|
+
trace_id, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
|
4661
|
+
)
|
4679
4662
|
|
4680
|
-
|
4681
|
-
|
4682
|
-
|
4683
|
-
|
4684
|
-
|
4663
|
+
if y_ is not None:
|
4664
|
+
if isinstance(y_, pd.Series):
|
4665
|
+
y_ = y_.to_frame()
|
4666
|
+
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4667
|
+
y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
|
4668
|
+
if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
|
4669
|
+
self.logger.info(
|
4670
|
+
f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
|
4685
4671
|
)
|
4686
|
-
|
4687
|
-
|
4688
|
-
|
4689
|
-
|
4672
|
+
else:
|
4673
|
+
self.rest_client.dump_input_file(
|
4674
|
+
trace_id, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
|
4675
|
+
)
|
4676
|
+
|
4677
|
+
if eval_set_ is not None and len(eval_set_) > 0:
|
4678
|
+
for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
|
4679
|
+
if isinstance(eval_x_, pd.Series):
|
4680
|
+
eval_x_ = eval_x_.to_frame()
|
4681
|
+
eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
|
4682
|
+
eval_x_digest_sha256 = self.rest_client.compute_file_digest(
|
4683
|
+
f"{tmp_dir}/eval_x_{idx}.parquet"
|
4690
4684
|
)
|
4691
|
-
|
4692
|
-
|
4693
|
-
|
4685
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
|
4686
|
+
self.logger.info(
|
4687
|
+
f"File eval_x_{idx}.parquet was already uploaded with"
|
4688
|
+
f" digest {eval_x_digest_sha256}, skipping"
|
4689
|
+
)
|
4690
|
+
else:
|
4691
|
+
self.rest_client.dump_input_file(
|
4692
|
+
trace_id,
|
4693
|
+
f"{tmp_dir}/eval_x_{idx}.parquet",
|
4694
|
+
f"eval_x_{idx}.parquet",
|
4695
|
+
eval_x_digest_sha256,
|
4696
|
+
)
|
4697
|
+
|
4698
|
+
if isinstance(eval_y_, pd.Series):
|
4699
|
+
eval_y_ = eval_y_.to_frame()
|
4700
|
+
eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
|
4701
|
+
eval_y_digest_sha256 = self.rest_client.compute_file_digest(
|
4702
|
+
f"{tmp_dir}/eval_y_{idx}.parquet"
|
4694
4703
|
)
|
4695
|
-
|
4696
|
-
|
4704
|
+
if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
|
4705
|
+
self.logger.info(
|
4706
|
+
f"File eval_y_{idx}.parquet was already uploaded"
|
4707
|
+
f" with digest {eval_y_digest_sha256}, skipping"
|
4708
|
+
)
|
4709
|
+
else:
|
4710
|
+
self.rest_client.dump_input_file(
|
4711
|
+
trace_id,
|
4712
|
+
f"{tmp_dir}/eval_y_{idx}.parquet",
|
4713
|
+
f"eval_y_{idx}.parquet",
|
4714
|
+
eval_y_digest_sha256,
|
4715
|
+
)
|
4716
|
+
except Exception:
|
4717
|
+
self.logger.warning("Failed to dump input files", exc_info=True)
|
4697
4718
|
|
4698
4719
|
try:
|
4699
4720
|
Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
|
upgini/http.py
CHANGED
@@ -274,7 +274,7 @@ class _RestClient:
|
|
274
274
|
SEARCH_FILE_METADATA_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/{0}/metadata"
|
275
275
|
SEARCH_TASK_METADATA_FMT_V3 = SERVICE_ROOT_V2 + "search/metadata-v2/{0}"
|
276
276
|
SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
|
277
|
-
SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file"
|
277
|
+
SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file?digest={0}"
|
278
278
|
TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
|
279
279
|
|
280
280
|
UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
|
@@ -406,8 +406,8 @@ class _RestClient:
|
|
406
406
|
meaning_types = [_RestClient.meaning_type_by_name(name, metadata) for name in search_key_names]
|
407
407
|
return [meaning_type.value for meaning_type in meaning_types if meaning_type is not None]
|
408
408
|
|
409
|
-
def dump_input_file(self, trace_id: str, path: str, file_name: str):
|
410
|
-
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
|
409
|
+
def dump_input_file(self, trace_id: str, path: str, file_name: str, digest: str):
|
410
|
+
api_path = self.SEARCH_DUMP_INPUT_FILE_FMT.format(digest)
|
411
411
|
with open(path, "rb") as file:
|
412
412
|
files = {"file": (file_name, file, "application/octet-stream")}
|
413
413
|
self._with_unauth_retry(
|
@@ -1,10 +1,10 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=MtpgyPilS-p0uCXLJRENxbcYk2BQX6y8kTyPV7OfGCU,24
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=BHZcpkUl7ncSdTgiMxk_a6oD1pLEkKPOm5wxebK8TsU,221609
|
7
|
+
upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
|
8
8
|
upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
|
9
9
|
upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.108.dist-info/METADATA,sha256=O9XA7uFUs-bvKrspv0HZi6vD_v8ZLSVckmDW5N5CeRo,49529
|
75
|
+
upgini-1.2.108.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
+
upgini-1.2.108.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.108.dist-info/RECORD,,
|
File without changes
|
File without changes
|