upgini 1.2.106a3956.dev1__py3-none-any.whl → 1.2.108__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.106a3956.dev1"
1
+ __version__ = "1.2.108"
@@ -208,7 +208,7 @@ class FeaturesEnricher(TransformerMixin):
208
208
  self,
209
209
  search_keys: Optional[Dict[str, SearchKey]] = None,
210
210
  country_code: Optional[str] = None,
211
- model_task_type: Optional[ModelTaskType] = None,
211
+ model_task_type: Optional[Union[ModelTaskType, str]] = None,
212
212
  api_key: Optional[str] = None,
213
213
  endpoint: Optional[str] = None,
214
214
  search_id: Optional[str] = None,
@@ -234,6 +234,7 @@ class FeaturesEnricher(TransformerMixin):
234
234
  id_columns: Optional[List[str]] = None,
235
235
  generate_search_key_features: bool = True,
236
236
  sample_config: Optional[SampleConfig] = None,
237
+ print_trace_id: bool = False,
237
238
  **kwargs,
238
239
  ):
239
240
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -282,6 +283,8 @@ class FeaturesEnricher(TransformerMixin):
282
283
  self.country_code = country_code
283
284
  self.__validate_search_keys(search_keys, search_id)
284
285
 
286
+ if model_task_type is not None:
287
+ self.model_task_type = ModelTaskType.parse(model_task_type)
285
288
  self.model_task_type = model_task_type
286
289
  self.endpoint = endpoint
287
290
  self._search_task: Optional[SearchTask] = None
@@ -303,6 +306,8 @@ class FeaturesEnricher(TransformerMixin):
303
306
 
304
307
  print(self.bundle.get("search_by_task_id_start"))
305
308
  trace_id = str(uuid.uuid4())
309
+ if self.print_trace_id:
310
+ print(f"@trace_id:{trace_id}")
306
311
  with MDC(trace_id=trace_id):
307
312
  try:
308
313
  self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
@@ -366,6 +371,7 @@ class FeaturesEnricher(TransformerMixin):
366
371
  self.data_sources_display_handle = None
367
372
  self.autofe_features_display_handle = None
368
373
  self.report_button_handle = None
374
+ self.print_trace_id = print_trace_id
369
375
 
370
376
  def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
371
377
  sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
@@ -461,6 +467,8 @@ class FeaturesEnricher(TransformerMixin):
461
467
  Otherwise, return all features from input and only selected features from data sources.
462
468
  """
463
469
  trace_id = str(uuid.uuid4())
470
+ if self.print_trace_id:
471
+ print(f"@trace_id:{trace_id}")
464
472
  start_time = time.time()
465
473
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
466
474
  search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
@@ -619,6 +627,8 @@ class FeaturesEnricher(TransformerMixin):
619
627
  self.warning_counter.reset()
620
628
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
621
629
  trace_id = str(uuid.uuid4())
630
+ if self.print_trace_id:
631
+ print(f"@trace_id:{trace_id}")
622
632
  start_time = time.time()
623
633
  with MDC(trace_id=trace_id):
624
634
  if len(args) > 0:
@@ -4633,65 +4643,78 @@ if response.status_code == 200:
4633
4643
  eval_set: Union[Tuple, None] = None,
4634
4644
  ):
4635
4645
  def dump_task(X_, y_, eval_set_):
4636
- try:
4637
- if isinstance(X_, pd.Series):
4638
- X_ = X_.to_frame()
4639
-
4640
- with tempfile.TemporaryDirectory() as tmp_dir:
4641
- X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
4642
- x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
4643
- if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
4644
- self.logger.info(f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping")
4645
- else:
4646
- self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/x.parquet", "x.parquet")
4647
-
4648
- if y_ is not None:
4649
- if isinstance(y_, pd.Series):
4650
- y_ = y_.to_frame()
4651
- y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
4652
- y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
4653
- if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
4646
+ with MDC(trace_id=trace_id):
4647
+ try:
4648
+ if isinstance(X_, pd.Series):
4649
+ X_ = X_.to_frame()
4650
+
4651
+ with tempfile.TemporaryDirectory() as tmp_dir:
4652
+ X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
4653
+ x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
4654
+ if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
4654
4655
  self.logger.info(
4655
- f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
4656
+ f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
4656
4657
  )
4657
4658
  else:
4658
- self.rest_client.dump_input_file(trace_id, f"{tmp_dir}/y.parquet", "y.parquet")
4659
-
4660
- if eval_set_ is not None and len(eval_set_) > 0:
4661
- for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
4662
- if isinstance(eval_x_, pd.Series):
4663
- eval_x_ = eval_x_.to_frame()
4664
- eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
4665
- eval_x_digest_sha256 = self.rest_client.compute_file_digest(
4666
- f"{tmp_dir}/eval_x_{idx}.parquet"
4667
- )
4668
- if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
4669
- self.logger.info(
4670
- f"File eval_x_{idx}.parquet was already uploaded with"
4671
- f" digest {eval_x_digest_sha256}, skipping"
4672
- )
4673
- else:
4674
- self.rest_client.dump_input_file(
4675
- trace_id, f"{tmp_dir}/eval_x_{idx}.parquet", f"eval_x_{idx}.parquet"
4676
- )
4659
+ self.rest_client.dump_input_file(
4660
+ trace_id, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
4661
+ )
4677
4662
 
4678
- if isinstance(eval_y_, pd.Series):
4679
- eval_y_ = eval_y_.to_frame()
4680
- eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
4681
- eval_y_digest_sha256 = self.rest_client.compute_file_digest(
4682
- f"{tmp_dir}/eval_y_{idx}.parquet"
4663
+ if y_ is not None:
4664
+ if isinstance(y_, pd.Series):
4665
+ y_ = y_.to_frame()
4666
+ y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
4667
+ y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
4668
+ if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
4669
+ self.logger.info(
4670
+ f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
4683
4671
  )
4684
- if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
4685
- self.logger.info(
4686
- f"File eval_y_{idx}.parquet was already uploaded"
4687
- f" with digest {eval_y_digest_sha256}, skipping"
4672
+ else:
4673
+ self.rest_client.dump_input_file(
4674
+ trace_id, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
4675
+ )
4676
+
4677
+ if eval_set_ is not None and len(eval_set_) > 0:
4678
+ for idx, (eval_x_, eval_y_) in enumerate(eval_set_):
4679
+ if isinstance(eval_x_, pd.Series):
4680
+ eval_x_ = eval_x_.to_frame()
4681
+ eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
4682
+ eval_x_digest_sha256 = self.rest_client.compute_file_digest(
4683
+ f"{tmp_dir}/eval_x_{idx}.parquet"
4688
4684
  )
4689
- else:
4690
- self.rest_client.dump_input_file(
4691
- trace_id, f"{tmp_dir}/eval_y_{idx}.parquet", f"eval_y_{idx}.parquet"
4685
+ if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
4686
+ self.logger.info(
4687
+ f"File eval_x_{idx}.parquet was already uploaded with"
4688
+ f" digest {eval_x_digest_sha256}, skipping"
4689
+ )
4690
+ else:
4691
+ self.rest_client.dump_input_file(
4692
+ trace_id,
4693
+ f"{tmp_dir}/eval_x_{idx}.parquet",
4694
+ f"eval_x_{idx}.parquet",
4695
+ eval_x_digest_sha256,
4696
+ )
4697
+
4698
+ if isinstance(eval_y_, pd.Series):
4699
+ eval_y_ = eval_y_.to_frame()
4700
+ eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
4701
+ eval_y_digest_sha256 = self.rest_client.compute_file_digest(
4702
+ f"{tmp_dir}/eval_y_{idx}.parquet"
4692
4703
  )
4693
- except Exception:
4694
- self.logger.warning("Failed to dump input files", exc_info=True)
4704
+ if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
4705
+ self.logger.info(
4706
+ f"File eval_y_{idx}.parquet was already uploaded"
4707
+ f" with digest {eval_y_digest_sha256}, skipping"
4708
+ )
4709
+ else:
4710
+ self.rest_client.dump_input_file(
4711
+ trace_id,
4712
+ f"{tmp_dir}/eval_y_{idx}.parquet",
4713
+ f"eval_y_{idx}.parquet",
4714
+ eval_y_digest_sha256,
4715
+ )
4716
+ except Exception:
4717
+ self.logger.warning("Failed to dump input files", exc_info=True)
4695
4718
 
4696
4719
  try:
4697
4720
  Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
upgini/http.py CHANGED
@@ -274,7 +274,7 @@ class _RestClient:
274
274
  SEARCH_FILE_METADATA_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/{0}/metadata"
275
275
  SEARCH_TASK_METADATA_FMT_V3 = SERVICE_ROOT_V2 + "search/metadata-v2/{0}"
276
276
  SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
277
- SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file"
277
+ SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file?digest={0}"
278
278
  TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
279
279
 
280
280
  UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
@@ -406,8 +406,8 @@ class _RestClient:
406
406
  meaning_types = [_RestClient.meaning_type_by_name(name, metadata) for name in search_key_names]
407
407
  return [meaning_type.value for meaning_type in meaning_types if meaning_type is not None]
408
408
 
409
- def dump_input_file(self, trace_id: str, path: str, file_name: str):
410
- api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
409
+ def dump_input_file(self, trace_id: str, path: str, file_name: str, digest: str):
410
+ api_path = self.SEARCH_DUMP_INPUT_FILE_FMT.format(digest)
411
411
  with open(path, "rb") as file:
412
412
  files = {"file": (file_name, file, "application/octet-stream")}
413
413
  self._with_unauth_retry(
upgini/metadata.py CHANGED
@@ -162,6 +162,15 @@ class ModelTaskType(Enum):
162
162
  def is_classification(self) -> bool:
163
163
  return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
164
164
 
165
+ @staticmethod
166
+ def parse(task_type: Any) -> "ModelTaskType":
167
+ if isinstance(task_type, ModelTaskType):
168
+ return task_type
169
+ elif isinstance(task_type, str):
170
+ return ModelTaskType(task_type.upper())
171
+ else:
172
+ raise ValueError(f"Invalid task type: {task_type}")
173
+
165
174
 
166
175
  class ModelLabelType(Enum):
167
176
  GINI = "gini"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.106a3956.dev1
3
+ Version: 1.2.108
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,11 +1,11 @@
1
- upgini/__about__.py,sha256=wMowjQ4NPzJq0VApX8igfNH42M4gS9xSnMJkWS4GHwk,34
1
+ upgini/__about__.py,sha256=MtpgyPilS-p0uCXLJRENxbcYk2BQX6y8kTyPV7OfGCU,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=dBCBqAhzVHqRB2b1sPy9wzSi9XtIzeb6uArmJhcjj_8,220370
7
- upgini/http.py,sha256=DNcoS7qdxG0mOJn6I8r6O5I6XdIJTdzDzW3hkz3NgG4,45443
8
- upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
6
+ upgini/features_enricher.py,sha256=BHZcpkUl7ncSdTgiMxk_a6oD1pLEkKPOm5wxebK8TsU,221609
7
+ upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
8
+ upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
9
9
  upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.106a3956.dev1.dist-info/METADATA,sha256=8gm0u1avCVEV1kBGkO9qYEFZHuRJ-lv_c4i0CfyO1Hg,49539
75
- upgini-1.2.106a3956.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.106a3956.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.106a3956.dev1.dist-info/RECORD,,
74
+ upgini-1.2.108.dist-info/METADATA,sha256=O9XA7uFUs-bvKrspv0HZi6vD_v8ZLSVckmDW5N5CeRo,49529
75
+ upgini-1.2.108.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
76
+ upgini-1.2.108.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.108.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any