upgini 1.1.131a3__py3-none-any.whl → 1.1.132__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,7 +63,7 @@ from upgini.version_validator import validate_version
63
63
  DEMO_DATASET_HASHES = [
64
64
  "7c354d1b1794c53ac7d7e5a2f2574568b660ca9159bc0d2aca9c7127ebcea2f7", # demo_salary fit
65
65
  "2519c9077c559f8975fdcdb5c50e9daae8d50b1d8a3ec72296c65ea7276f8812", # demo_salary transform
66
- ]
66
+ ]
67
67
 
68
68
 
69
69
  class FeaturesEnricher(TransformerMixin):
@@ -139,15 +139,16 @@ class FeaturesEnricher(TransformerMixin):
139
139
  raise_validation_error: bool = False,
140
140
  **kwargs,
141
141
  ):
142
- self.api_key = api_key or os.environ.get(UPGINI_API_KEY)
142
+ self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
143
143
  try:
144
- self.rest_client = get_rest_client(endpoint, self.api_key)
144
+ self.rest_client = get_rest_client(endpoint, self._api_key)
145
145
  except UpginiConnectionError as e:
146
146
  print(e)
147
147
  return
148
148
 
149
+ self.logs_enabled = logs_enabled
149
150
  if logs_enabled:
150
- self.logger = LoggerFactory().get_logger(endpoint, self.api_key)
151
+ self.logger = LoggerFactory().get_logger(endpoint, self._api_key)
151
152
  else:
152
153
  self.logger = logging.getLogger()
153
154
  self.logger.setLevel("FATAL")
@@ -174,7 +175,7 @@ class FeaturesEnricher(TransformerMixin):
174
175
  search_task = SearchTask(
175
176
  search_id,
176
177
  endpoint=self.endpoint,
177
- api_key=self.api_key,
178
+ api_key=self._api_key,
178
179
  )
179
180
 
180
181
  print(bundle.get("search_by_task_id_start"))
@@ -235,6 +236,16 @@ class FeaturesEnricher(TransformerMixin):
235
236
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
236
237
  self.raise_validation_error = raise_validation_error
237
238
 
239
+ def _get_api_key(self):
240
+ return self._api_key
241
+
242
+ def _set_api_key(self, api_key: str):
243
+ self._api_key = api_key
244
+ if self.logs_enabled:
245
+ self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key)
246
+
247
+ api_key = property(_get_api_key, _set_api_key)
248
+
238
249
  def fit(
239
250
  self,
240
251
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
@@ -579,13 +590,14 @@ class FeaturesEnricher(TransformerMixin):
579
590
  finally:
580
591
  self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
581
592
 
582
- if self.country_added:
583
- result = drop_existing_columns(result, COUNTRY)
593
+ if result is not None:
594
+ if self.country_added:
595
+ result = drop_existing_columns(result, COUNTRY)
584
596
 
585
- if keep_input:
586
- return result
587
- else:
588
- return drop_existing_columns(result, X.columns)
597
+ if keep_input:
598
+ return result
599
+ else:
600
+ return drop_existing_columns(result, X.columns)
589
601
 
590
602
  def calculate_metrics(
591
603
  self,
@@ -912,13 +924,21 @@ class FeaturesEnricher(TransformerMixin):
912
924
  def _has_features_with_commercial_schema(
913
925
  self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
914
926
  ) -> bool:
927
+ return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
928
+
929
+ def _get_features_with_commercial_schema(
930
+ self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
931
+ ) -> List[str]:
915
932
  if exclude_features_sources:
916
933
  filtered_features_info = self.features_info[
917
934
  ~self.features_info[bundle.get("features_info_name")].isin(exclude_features_sources)
918
935
  ]
919
936
  else:
920
937
  filtered_features_info = self.features_info
921
- return (filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema).any()
938
+ return list(filtered_features_info.loc[
939
+ filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema,
940
+ bundle.get("features_info_name"),
941
+ ].values)
922
942
 
923
943
  def _has_trial_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
924
944
  return self._has_features_with_commercial_schema(CommercialSchema.TRIAL.value, exclude_features_sources)
@@ -1248,7 +1268,6 @@ class FeaturesEnricher(TransformerMixin):
1248
1268
  msg = bundle.get("transform_with_trial_features")
1249
1269
  self.logger.warn(msg)
1250
1270
  print(msg)
1251
- return None
1252
1271
 
1253
1272
  columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
1254
1273
  if len(columns_to_drop) > 0:
@@ -1313,7 +1332,9 @@ class FeaturesEnricher(TransformerMixin):
1313
1332
 
1314
1333
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1315
1334
 
1316
- df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False)
1335
+ df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
1336
+ "Float64"
1337
+ )
1317
1338
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1318
1339
 
1319
1340
  df = df.reset_index(drop=True)
@@ -1488,10 +1509,11 @@ class FeaturesEnricher(TransformerMixin):
1488
1509
  else None
1489
1510
  )
1490
1511
  is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
1491
- if (is_demo_dataset):
1512
+ if is_demo_dataset:
1492
1513
  msg = bundle.get("demo_dataset_info")
1493
1514
  self.logger.info(msg)
1494
- print(msg)
1515
+ if not self.__is_registered:
1516
+ print(msg)
1495
1517
 
1496
1518
  if self.generate_features is not None and len(self.generate_features) > 0:
1497
1519
  x_columns = list(validated_X.columns)
upgini/http.py CHANGED
@@ -11,8 +11,8 @@ from http.client import HTTPConnection
11
11
  from json import dumps
12
12
  from typing import Dict, List, Optional
13
13
  from urllib.parse import urljoin
14
- import pandas as pd
15
14
 
15
+ import pandas as pd
16
16
  import requests
17
17
  from pydantic import BaseModel
18
18
  from pythonjsonlogger import jsonlogger
@@ -32,7 +32,7 @@ from upgini.metadata import (
32
32
  SearchCustomization,
33
33
  )
34
34
  from upgini.resource_bundle import bundle
35
- from upgini.utils.track_info import get_track_metrics
35
+ from upgini.utils.track_info import get_track_metrics_with_timeout
36
36
 
37
37
  try:
38
38
  from importlib_metadata import version
@@ -49,6 +49,7 @@ except ImportError:
49
49
  UPGINI_URL: str = "UPGINI_URL"
50
50
  UPGINI_API_KEY: str = "UPGINI_API_KEY"
51
51
  DEMO_API_KEY: str = "Aa4BPwGFbn1zNEXIkZ-NbhsRk0ricN6puKuga1-O5lM"
52
+ TRACK_METRICS_TIMEOUT_SECONDS: int = 10
52
53
 
53
54
  refresh_token_lock = threading.Lock()
54
55
 
@@ -374,7 +375,11 @@ class _RestClient:
374
375
  search_customization.json(exclude_none=True).encode(),
375
376
  "application/json",
376
377
  )
377
- files["tracking"] = ("tracking.json", dumps(get_track_metrics()).encode(), "application/json")
378
+ files["tracking"] = (
379
+ "tracking.json",
380
+ dumps(get_track_metrics_with_timeout(TRACK_METRICS_TIMEOUT_SECONDS)).encode(),
381
+ "application/json",
382
+ )
378
383
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
379
384
 
380
385
  return self._send_post_file_req_v2(
@@ -433,7 +438,6 @@ class _RestClient:
433
438
  digest = md5_hash.hexdigest()
434
439
  metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
435
440
 
436
- import pandas as pd
437
441
  digest_sha256 = hashlib.sha256(pd.util.hash_pandas_object(pd.read_parquet(file_path)).values).hexdigest()
438
442
  metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
439
443
 
@@ -453,7 +457,11 @@ class _RestClient:
453
457
  search_customization.json(exclude_none=True).encode(),
454
458
  "application/json",
455
459
  )
456
- files["tracking"] = ("ide", dumps(get_track_metrics()).encode(), "application/json")
460
+ files["tracking"] = (
461
+ "ide",
462
+ dumps(get_track_metrics_with_timeout(TRACK_METRICS_TIMEOUT_SECONDS)).encode(),
463
+ "application/json",
464
+ )
457
465
 
458
466
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
459
467
 
@@ -787,16 +795,20 @@ class BackendLogHandler(logging.Handler):
787
795
  def __init__(self, rest_client: _RestClient, *args, **kwargs) -> None:
788
796
  super().__init__(*args, **kwargs)
789
797
  self.rest_client = rest_client
790
- if "ip" in get_track_metrics().keys():
791
- self.hostname = get_track_metrics()["ip"]
792
- else:
793
- self.hostname = "0.0.0.0"
798
+ self.track_metrics = None
799
+ self.hostname = None
794
800
 
795
801
  def emit(self, record: logging.LogRecord) -> None:
796
802
  def task():
797
803
  try:
804
+ if self.track_metrics is None:
805
+ self.track_metrics = get_track_metrics_with_timeout(TRACK_METRICS_TIMEOUT_SECONDS)
806
+ if "ip" in self.track_metrics.keys():
807
+ self.hostname = self.track_metrics["ip"]
808
+ else:
809
+ self.hostname = "0.0.0.0"
798
810
  text = self.format(record)
799
- tags = get_track_metrics()
811
+ tags = self.track_metrics
800
812
  tags["version"] = __version__
801
813
  self.rest_client.send_log_event(
802
814
  LogEvent(
@@ -25,7 +25,8 @@ metrics_no_important_free_features=WARNING: No important free features to calcul
25
25
  metrics_no_important_features=WARNING: No important features to calculate metrics
26
26
  metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
27
27
  metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
28
- transform_with_trial_features=Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
28
+ transform_with_trial_features=WARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
29
+ # Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
29
30
  metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
30
31
  transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
31
32
  trial_quota_limit_riched=WARNING: You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
upgini/search_task.py CHANGED
@@ -59,8 +59,8 @@ class SearchTask:
59
59
  submitted_statuses = {"SUBMITTED", "VALIDATION_SUBMITTED"}
60
60
  if not quiet:
61
61
  print(bundle.get("polling_search_task").format(self.search_task_id))
62
- if is_demo_api_key(self.api_key):
63
- print(bundle.get("polling_unregister_information"))
62
+ if is_demo_api_key(self.api_key):
63
+ print(bundle.get("polling_unregister_information"))
64
64
  search_task_id = self.initial_search_task_id if self.initial_search_task_id is not None else self.search_task_id
65
65
 
66
66
  try:
@@ -6,6 +6,7 @@ from functools import lru_cache
6
6
  from getpass import getuser
7
7
  from hashlib import sha256
8
8
  from uuid import getnode
9
+ from concurrent import futures
9
10
 
10
11
  from requests import get, post
11
12
 
@@ -45,6 +46,17 @@ def _get_execution_ide() -> str:
45
46
  return "other"
46
47
 
47
48
 
49
def get_track_metrics_with_timeout(timeout_seconds: int = 10) -> dict:
    """Return the result of ``get_track_metrics`` or ``{}`` if it is too slow.

    ``get_track_metrics`` is run in a worker thread and awaited for at most
    ``timeout_seconds``.

    Args:
        timeout_seconds: Maximum number of seconds to wait for the metrics.

    Returns:
        The dict produced by ``get_track_metrics``, or an empty dict when the
        call did not finish within the timeout.

    Raises:
        Any exception raised by ``get_track_metrics`` itself is propagated.
    """
    # Deliberately not a ``with`` block: leaving the executor's context manager
    # calls ``shutdown(wait=True)``, which joins the worker thread and would
    # keep the caller blocked until ``get_track_metrics`` returns, i.e. the
    # timeout would never take effect.
    executor = futures.ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(get_track_metrics).result(timeout_seconds)
    except futures.TimeoutError:
        # A task that is already running cannot be cancelled; abandon it (it
        # keeps running in the background) and report no metrics.
        return dict()
    finally:
        # Release the executor without waiting for an unfinished worker.
        executor.shutdown(wait=False)
58
+
59
+
48
60
  @lru_cache()
49
61
  def get_track_metrics() -> dict:
50
62
  # default values
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.131a3
3
+ Version: 1.1.132
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -33,7 +33,7 @@ Requires-Dist: pandas (<2.0.0,>=1.1.0)
33
33
  Requires-Dist: numpy (>=1.19.0)
34
34
  Requires-Dist: scikit-learn (>=1.0.1)
35
35
  Requires-Dist: pydantic (>=1.8.2)
36
- Requires-Dist: fastparquet (>=0.7.1)
36
+ Requires-Dist: fastparquet (>=0.8.1)
37
37
  Requires-Dist: python-json-logger (>=2.0.2)
38
38
  Requires-Dist: catboost (>=1.0.3)
39
39
  Requires-Dist: lightgbm (>=3.3.2)
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=uiba8qC2RNvqka5MQp-XgdYDE7-nqO5FojMO5a0n_HA,42978
4
4
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
5
- upgini/features_enricher.py,sha256=QzNnU2Do5hHzvUA9HUVrSV3gPLtTL659HTiFLI0mfrA,117921
6
- upgini/http.py,sha256=Hq0rtEO2qN-IVYDPSyo7WJJ7gxjgF_z4dKIQzFOXOZg,35947
5
+ upgini/features_enricher.py,sha256=iV9OHy8QFQdB5uUX6S2RuI28WBZFNkR5LZwRrhuB8Oo,118746
6
+ upgini/http.py,sha256=164ysTRGZ2hRtsFTjl4vBsyaz0iS75jiF7BYGFk2MVc,36468
7
7
  upgini/metadata.py,sha256=Oefg-rkA4PsZUHIho_clZcnyZwdtVJ1gXPvEY6oBmpg,5969
8
8
  upgini/metrics.py,sha256=5tQ_6ZKUM0EBLnHmKZD63KHCQXTYGeE-uFLs0wHcYf4,15477
9
- upgini/search_task.py,sha256=4SbJhEqCUrntf-JrEsM4R_y3vebNEsJ5JXBeS7Y1K38,13589
9
+ upgini/search_task.py,sha256=cM-3cAcdnKHrCl7ZTqhHLrSkDVpdgowV5YBaFq2BzjY,13597
10
10
  upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
11
11
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
12
12
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -19,7 +19,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
19
19
  upgini/normalizer/phone_normalizer.py,sha256=VIgLXuDuzzjPEXiy_LyDVLZKGaS7-le6Fh6T4D-TQDU,9930
20
20
  upgini/resource_bundle/__init__.py,sha256=M7GtS7KPQw9pinz8P2aQWXpSkD2YFwUPVGk1w92Pn84,7888
21
21
  upgini/resource_bundle/exceptions.py,sha256=KT-OnqA2J4OTfLjhbEl3KFZM2ci7EOPjqJuY_rXp3vs,622
22
- upgini/resource_bundle/strings.properties,sha256=pOQuq5RFEYqsB1edeMnQCwlogRH7E0fMIQg8vH6bhqo,20891
22
+ upgini/resource_bundle/strings.properties,sha256=X0-yWoK3uAMZdlXk3IhGOgqqa5XC4YnElK-olp0sbRM,21137
23
23
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  upgini/sampler/base.py,sha256=X2PVsfZ3Rl7twpFDh5UWyxqY2K_jcMGxZ2NcHLwFRj4,6489
25
25
  upgini/sampler/random_under_sampler.py,sha256=whX_f_TtalHH8Seyn_7n3sX_TSiDHeYfALmme9saqDg,4082
@@ -37,10 +37,10 @@ upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
37
37
  upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
38
38
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
39
39
  upgini/utils/target_utils.py,sha256=3eHrDy_Dc9ozuOwHGnGA705m9glCxKmjB-DfLrflqiA,1370
40
- upgini/utils/track_info.py,sha256=O_oL4gy1jH0DVgtiUeZAW0YKCeRT4B_bzH_SZYkFaOE,4076
40
+ upgini/utils/track_info.py,sha256=dfDwLdQX7aY0eRT1izsMpeEiN0bYx_2YH7O69AoVhlU,4465
41
41
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
42
- upgini-1.1.131a3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
43
- upgini-1.1.131a3.dist-info/METADATA,sha256=VifR-A7emG26-mWmz35BrxZyUYZsS_LiCAvdnspo7ZA,43229
44
- upgini-1.1.131a3.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
45
- upgini-1.1.131a3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
46
- upgini-1.1.131a3.dist-info/RECORD,,
42
+ upgini-1.1.132.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
43
+ upgini-1.1.132.dist-info/METADATA,sha256=sUIjt9bSFBtawDxGqez0g6UUJmfFXMHgnQd4W8x08IY,43227
44
+ upgini-1.1.132.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
45
+ upgini-1.1.132.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
46
+ upgini-1.1.132.dist-info/RECORD,,