upgini 1.2.88a3884.dev1__py3-none-any.whl → 1.2.90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini may be problematic.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +23 -2
- upgini/features_enricher.py +34 -74
- upgini/http.py +29 -25
- upgini/metrics.py +10 -11
- upgini/resource_bundle/strings.properties +2 -0
- upgini/utils/sklearn_ext.py +1 -1
- upgini/utils/target_utils.py +2 -1
- {upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/METADATA +1 -1
- {upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/RECORD +12 -12
- {upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/WHEEL +1 -1
- {upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.88a3884.dev1"
+__version__ = "1.2.90"
@@ -5,6 +5,8 @@ from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
+import pandas as pd
+
 from upgini.errors import HttpError, ValidationError
 from upgini.http import LoggerFactory, get_rest_client
 from upgini.mdc import MDC
@@ -33,7 +35,7 @@ class OnlineUploadingType(Enum):
 class DataSourcePublisher:
     FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
     ACCEPTABLE_UPDATE_FREQUENCIES = ["Daily", "Weekly", "Monthly", "Quarterly", "Annually"]
-    DEFAULT_GENERATE_EMBEDDINGS =
+    DEFAULT_GENERATE_EMBEDDINGS = dict()
 
     def __init__(self, api_key: Optional[str] = None, endpoint: Optional[str] = None, logs_enabled=True):
         self._rest_client = get_rest_client(endpoint, api_key)
@@ -58,7 +60,7 @@ class DataSourcePublisher:
         hash_feature_names=False,
         snapshot_frequency_days: Optional[int] = None,
         join_date_abs_limit_days: Optional[int] = None,
-        features_for_embeddings: Optional[
+        features_for_embeddings: Optional[Dict[str, str]] = DEFAULT_GENERATE_EMBEDDINGS,
         data_table_id_to_replace: Optional[str] = None,
         keep_features: Optional[List[str]] = None,
         date_features: Optional[List[str]] = None,
@@ -137,6 +139,25 @@ class DataSourcePublisher:
         ) and not date_format:
             raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
 
+        if secondary_search_keys:
+            response = self._rest_client.get_active_ads_definitions()
+            definitions = pd.DataFrame(response["adsDefinitions"])
+            prod_secondary_definitions = definitions.query(
+                "(secondarySearchKeys.astype('string') != '[]') & (adsDefinitionAccessType == 'PROD')"
+            )[["name", "searchKeys", "secondarySearchKeys"]]
+            for _, row in prod_secondary_definitions.iterrows():
+                existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
+                if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
+                    existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
+                    if (
+                        existing_search_keys == {v.value.name for v in search_keys.values()}
+                        or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
+                    ):
+                        raise ValidationError(
+                            "ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
+                            f"already exists: {row['name']}"
+                        )
+
         request = {
             "dataTableUri": data_table_uri,
             "searchKeys": {k: v.value.value for k, v in search_keys.items()},
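The new guard above flattens the nested key lists of existing PROD definitions into sets and compares them with the requested mapping. A standalone sketch of that comparison, using hypothetical toy definitions rather than a real get_active_ads_definitions() response, and a boolean-mask filter equivalent to the .query() call in the diff:

import pandas as pd

# Hypothetical ADS definitions for illustration only.
definitions = pd.DataFrame({
    "name": ["ads_a", "ads_b"],
    "searchKeys": [[["MSISDN"]], [["IP"]]],
    "secondarySearchKeys": [[["HEM"]], []],
    "adsDefinitionAccessType": ["PROD", "PROD"],
})

# Keep PROD definitions that declare at least one secondary search key
# (same condition as the .query() expression above).
mask = (definitions["secondarySearchKeys"].astype("string") != "[]") & (
    definitions["adsDefinitionAccessType"] == "PROD"
)
prod_secondary = definitions[mask][["name", "searchKeys", "secondarySearchKeys"]]

for _, row in prod_secondary.iterrows():
    # Flatten the nested lists into a set, as the new validation does.
    existing_secondary = {k for sublist in row["secondarySearchKeys"] for k in sublist}
    print(row["name"], existing_secondary)  # ads_a {'HEM'}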
upgini/features_enricher.py
CHANGED
@@ -7,7 +7,6 @@ import json
 import logging
 import numbers
 import os
-import pickle
 import sys
 import tempfile
 import time
@@ -1671,10 +1670,6 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_y_sorted,
         )
 
-        fitting_X, fitting_enriched_X, fitting_eval_set_dict = self._convert_id_columns_to_int(
-            fitting_X, fitting_enriched_X, fitting_eval_set_dict, columns_renaming
-        )
-
         return (
             validated_X,
             fitting_X,
@@ -1688,38 +1683,6 @@ class FeaturesEnricher(TransformerMixin):
             columns_renaming,
         )
 
-    def _convert_id_columns_to_int(
-        self,
-        fitting_X: pd.DataFrame,
-        fitting_enriched_X: pd.DataFrame,
-        fitting_eval_set_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]],
-        columns_renaming: Dict[str, str] = {},
-    ) -> pd.DataFrame:
-        def _set_encoded(col_name: str, df: pd.DataFrame, slice: Tuple[int, int], combined_col: pd.Series):
-            df[col_name] = combined_col.iloc[slice[0] : slice[1]]
-            return slice[1]
-
-        inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-
-        if self.id_columns:
-            self.logger.info(f"Convert id columns to int: {self.id_columns}")
-            for col in self.id_columns:
-                col = inverse_columns_renaming.get(col, col)
-                combined_col = pd.concat(
-                    [fitting_X[col], fitting_enriched_X[col]]
-                    + [eval_set_pair[0][col] for eval_set_pair in fitting_eval_set_dict.values()]
-                )
-                combined_col = combined_col.astype("category").cat.codes
-                slice_end = _set_encoded(col, fitting_X, (0, len(fitting_X)), combined_col)
-                slice_end = _set_encoded(
-                    col, fitting_enriched_X, (slice_end, slice_end + len(fitting_enriched_X)), combined_col
-                )
-                for eval_set_pair in fitting_eval_set_dict.values():
-                    slice_end = _set_encoded(
-                        col, eval_set_pair[0], (slice_end, slice_end + len(eval_set_pair[0])), combined_col
-                    )
-        return fitting_X, fitting_enriched_X, fitting_eval_set_dict
-
     @dataclass
     class _SampledDataForMetrics:
         X_sampled: pd.DataFrame
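For context, the removed helper mapped arbitrary id values to consecutive integers by concatenating all frames, taking pandas categorical codes, and slicing the coded column back out, so the same id got the same code in every frame. A minimal standalone sketch of that trick (toy data, not upgini's API):

import pandas as pd

train_ids = pd.Series(["u-7", "u-3", "u-7"])
test_ids = pd.Series(["u-3", "u-9"])

# Shared encoding: concatenate, code, then slice back by position.
combined = pd.concat([train_ids, test_ids], ignore_index=True)
codes = combined.astype("category").cat.codes  # u-3 -> 0, u-7 -> 1, u-9 -> 2

train_codes = codes.iloc[: len(train_ids)]
test_codes = codes.iloc[len(train_ids):]
print(train_codes.tolist())  # [1, 0, 1]
print(test_codes.tolist())   # [0, 2]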
@@ -3486,6 +3449,11 @@ if response.status_code == 200:
         if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
+        # Check for duplicates between train and eval sets by comparing all values
+        train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
+        if len(train_eval_intersection) > 0:
+            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+
         return validated_eval_X, validated_eval_y
 
     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
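The added leakage check works because pd.merge without an "on" argument joins on all columns the two frames share, so with how='inner' every surviving row exists in both sets. A small illustration with toy frames:

import pandas as pd

train = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
eval_x = pd.DataFrame({"a": [3, 4], "b": ["z", "w"]})

# Inner join over all shared columns keeps only rows present in both frames.
overlap = pd.merge(train, eval_x, how="inner")
print(len(overlap))  # 1 -> the row (3, "z") appears in both sets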
@@ -4012,7 +3980,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical"
+        return [f.name for f in features_meta if f.type == "categorical"]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -4605,60 +4573,52 @@ if response.status_code == 200:
         y: Union[pd.DataFrame, pd.Series, None] = None,
         eval_set: Union[Tuple, None] = None,
     ):
-        def dump_task():
+        def dump_task(X_, y_, eval_set_):
             try:
-
-
-
-
-                else:
-                    xy_sample_index = []
-
-                def sample(inp, sample_index):
-                    if _num_samples(inp) <= 1000:
-                        return inp
-                    if isinstance(inp, (pd.DataFrame, pd.Series)):
-                        return inp.sample(n=1000, random_state=random_state)
-                    if isinstance(inp, np.ndarray):
-                        return inp[sample_index]
-                    if isinstance(inp, list):
-                        return inp[sample_index]
+                if isinstance(X_, pd.Series):
+                    X_ = X_.to_frame()
+
+                # TODO check that this file was already uploaded
 
                 with tempfile.TemporaryDirectory() as tmp_dir:
-
-
-                    if
-
-
-
-
-
-
-
-
+                    X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
+
+                    if y_ is not None:
+                        if isinstance(y_, pd.Series):
+                            y_ = y_.to_frame()
+                        y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
+                        if eval_set_ and _num_samples(eval_set_[0][0]) > 0:
+                            eval_x_ = eval_set_[0][0]
+                            eval_y_ = eval_set_[0][1]
+                            if isinstance(eval_x_, pd.Series):
+                                eval_x_ = eval_x_.to_frame()
+                            eval_x_.to_parquet(f"{tmp_dir}/eval_x.parquet", compression="zstd")
+                            if isinstance(eval_y_, pd.Series):
+                                eval_y_ = eval_y_.to_frame()
+                            eval_y_.to_parquet(f"{tmp_dir}/eval_y.parquet", compression="zstd")
                             self.rest_client.dump_input_files(
                                 trace_id,
-                                f"{tmp_dir}/x.
-                                f"{tmp_dir}/y.
-                                f"{tmp_dir}/eval_x.
-                                f"{tmp_dir}/eval_y.
+                                f"{tmp_dir}/x.parquet",
+                                f"{tmp_dir}/y.parquet",
+                                f"{tmp_dir}/eval_x.parquet",
+                                f"{tmp_dir}/eval_y.parquet",
                             )
                         else:
                             self.rest_client.dump_input_files(
                                 trace_id,
-                                f"{tmp_dir}/x.
-                                f"{tmp_dir}/y.
+                                f"{tmp_dir}/x.parquet",
+                                f"{tmp_dir}/y.parquet",
                             )
                     else:
                         self.rest_client.dump_input_files(
                             trace_id,
-                            f"{tmp_dir}/x.
+                            f"{tmp_dir}/x.parquet",
                         )
             except Exception:
                 self.logger.warning("Failed to dump input files", exc_info=True)
 
         try:
-            Thread(target=dump_task, daemon=True).start()
+            Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
         except Exception:
             self.logger.warning("Failed to dump input files", exc_info=True)
 
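The rewritten dump_task writes zstd-compressed Parquet instead of the old dump format (the removed "import pickle" and the "*.pickle" filenames removed from http.py suggest pickle was used before). A minimal sketch of the per-frame step; it assumes a pyarrow build with zstd support:

import tempfile

import pandas as pd

X = pd.Series([1.0, 2.0, 3.0], name="feature")

with tempfile.TemporaryDirectory() as tmp_dir:
    if isinstance(X, pd.Series):
        X = X.to_frame()  # Parquet stores tables, so a Series must become a DataFrame
    # compression="zstd" requires a parquet engine (pyarrow) built with zstd
    X.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
    print(pd.read_parquet(f"{tmp_dir}/x.parquet").shape)  # (3, 1)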
upgini/http.py
CHANGED
@@ -252,6 +252,7 @@ class _RestClient:
 
     # V2
     CHECK_UPLOADED_FILE_URL_FMT_V2 = SERVICE_ROOT_V2 + "search/check-file?fileUploadId={0}"
+    IS_FILE_UPLOADED_URL_FMT_V2 = SERVICE_ROOT_V2 + "search/files/exists?digest={0}"
     INITIAL_SEARCH_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/initial"
     INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/initial-without-upload?fileUploadId={0}"
     VALIDATION_SEARCH_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/validation?initialSearchTaskId={0}"
@@ -272,6 +273,7 @@ class _RestClient:
     SEARCH_FILE_METADATA_URI_FMT_V2 = SERVICE_ROOT_V2 + "search/{0}/metadata"
     SEARCH_TASK_METADATA_FMT_V3 = SERVICE_ROOT_V2 + "search/metadata-v2/{0}"
     SEARCH_DUMP_INPUT_FMT_V2 = SERVICE_ROOT_V2 + "search/dump-input"
+    SEARCH_DUMP_INPUT_FILE_FMT = SERVICE_ROOT_V2 + "search/dump-input-file"
     TRANSFORM_USAGE_FMT = SERVICE_ROOT_V2 + "user/transform-usage"
 
     UPLOAD_USER_ADS_URI = SERVICE_ROOT + "ads/upload"
@@ -410,32 +412,29 @@
         eval_x_path: Optional[str] = None,
         eval_y_path: Optional[str] = None,
     ):
-        api_path = self.
-
-
-
-        if
-            with
-
-                if eval_x_path and eval_y_path:
-                    with open(eval_x_path, "rb") as eval_x_file, open(eval_y_path, "rb") as eval_y_file:
-                        files["eval_x"] = ("eval_x.pickle", eval_x_file, "application/octet-stream")
-                        files["eval_y"] = ("eval_y.pickle", eval_y_file, "application/octet-stream")
-                        self._with_unauth_retry(
-                            lambda: self._send_post_file_req_v2(
-                                api_path, files, trace_id=trace_id, need_json_response=False
-                            )
-                        )
-                else:
-                    self._with_unauth_retry(
-                        lambda: self._send_post_file_req_v2(
-                            api_path, files, trace_id=trace_id, need_json_response=False
-                        )
-                    )
+        api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
+
+        def upload_with_check(path: str, file_name: str):
+            digest_sha256 = self.compute_file_digest(path)
+            if self.is_file_uploaded(trace_id, digest_sha256):
+                # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
+                return
             else:
-
-
-
+                with open(path, "rb") as file:
+                    files = {"file": (file_name, file, "application/octet-stream")}
+                    self._with_unauth_retry(
+                        lambda: self._send_post_file_req_v2(
+                            api_path, files, trace_id=trace_id, need_json_response=False
+                        )
+                    )
+
+        upload_with_check(x_path, "x.parquet")
+        if y_path:
+            upload_with_check(y_path, "y.parquet")
+        if eval_x_path:
+            upload_with_check(eval_x_path, "eval_x.parquet")
+        if eval_y_path:
+            upload_with_check(eval_y_path, "eval_y.parquet")
 
     @staticmethod
     def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
@@ -514,6 +513,11 @@
         )
         return bool(response)
 
+    def is_file_uploaded(self, trace_id: str, digest: str) -> bool:
+        api_path = self.IS_FILE_UPLOADED_URL_FMT_V2.format(digest)
+        response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
+        return bool(response)
+
     def initial_search_without_upload_v2(
         self,
         trace_id: str,
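The new exists endpoint is keyed by a file digest, and compute_file_digest already exists in http.py with the signature shown in the context lines above. A sketch of a chunked digest consistent with that signature; only the signature comes from the diff, the body below is an assumption:

import hashlib

def compute_file_digest(filepath: str, algorithm="sha256", chunk_size=4096) -> str:
    # Stream the file in fixed-size chunks so large dumps never load fully into memory.
    hasher = hashlib.new(algorithm)
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()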
upgini/metrics.py
CHANGED
@@ -332,7 +332,7 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
-        self.
+        self.droped_features = []
         self.converted_to_int = []
         self.converted_to_str = []
         self.converted_to_numeric = []
@@ -381,11 +381,10 @@ class EstimatorWrapper:
         x, y, groups = self._prepare_data(x, y, groups=self.groups)
 
         self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        self.
+        self.droped_features = []
        self.converted_to_int = []
         self.converted_to_str = []
         self.converted_to_numeric = []
-
         for c in x.columns:
 
             if _get_unique_count(x[c]) < 2:
@@ -393,7 +392,7 @@
                 if c in self.cat_features:
                     self.cat_features.remove(c)
                 x.drop(columns=[c], inplace=True)
-                self.
+                self.droped_features.append(c)
             elif self.text_features is not None and c in self.text_features:
                 x[c] = x[c].astype(str)
                 self.converted_to_str.append(c)
@@ -428,16 +427,16 @@
             except (ValueError, TypeError):
                 self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
                 x.drop(columns=[c], inplace=True)
-                self.
+                self.droped_features.append(c)
 
         return x, y, groups, {}
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
 
-        if self.
-            self.logger.info(f"Drop features on calculate metrics: {self.
-            x = x.drop(columns=self.
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
 
         if self.converted_to_int:
             self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
@@ -798,7 +797,7 @@ class CatBoostWrapper(EstimatorWrapper):
             )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.
+                self.droped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -898,7 +897,7 @@ class LightGBMWrapper(EstimatorWrapper):
         for c in x.columns:
             if x[c].dtype not in ["category", "int64", "float64", "bool"]:
                 self.logger.warning(f"Feature {c} is not numeric and will be dropped")
-                self.
+                self.droped_features.append(c)
                 x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params
 
|
|
989
988
|
for c in x.columns:
|
990
989
|
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
991
990
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
992
|
-
self.
|
991
|
+
self.droped_features.append(c)
|
993
992
|
x = x.drop(columns=c, errors="ignore")
|
994
993
|
return x, y_numpy, groups, params
|
995
994
|
|
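All of these metrics.py changes maintain one list, droped_features (the identifier really is spelled that way in the package), filled while preparing data for fitting and replayed in _prepare_to_calculate so fit-time and metric-time feature sets stay identical. A toy illustration of that bookkeeping, independent of upgini's classes:

import pandas as pd

droped_features = []
x_fit = pd.DataFrame({"const": [1, 1, 1], "f1": [0.1, 0.2, 0.3]})
for c in x_fit.columns:
    if x_fit[c].nunique() < 2:              # a constant column carries no signal
        x_fit = x_fit.drop(columns=[c])
        droped_features.append(c)

x_eval = pd.DataFrame({"const": [1, 1], "f1": [0.4, 0.5]})
x_eval = x_eval.drop(columns=droped_features)  # replay the same drops at metric time
print(list(x_eval.columns))  # ['f1']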
upgini/resource_bundle/strings.properties
CHANGED
@@ -137,6 +137,8 @@ eval_y_multiindex_unsupported=Multi index in y in eval_set is not supported
 eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
+eval_x_has_train_samples=Eval set X has rows that are present in train set X
+
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
upgini/utils/sklearn_ext.py
CHANGED
@@ -1324,7 +1324,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
         else:
             # Shuffle train data
             X_train_shuffled, y_train_shuffled = _shuffle_pair(
-                X_train[cat_features]
+                X_train[cat_features], y_train, random_state
             )
 
             # Fit encoder on training fold
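_shuffle_pair is internal to upgini, but the fix evidently passes y_train and random_state so features and target are permuted together rather than independently. sklearn.utils.shuffle demonstrates the same pairwise contract:

import pandas as pd
from sklearn.utils import shuffle

X_train = pd.DataFrame({"cat": ["a", "b", "c", "d"]})
y_train = pd.Series([0, 1, 0, 1])

# One permutation is applied to both arrays, so rows stay aligned.
X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=42)
assert list(X_shuffled.index) == list(y_shuffled.index)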
upgini/utils/target_utils.py
CHANGED
@@ -416,6 +416,7 @@ def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Excep
         test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
 
         # Calculate the PSI
-
+        ratio = np.where(test_distribution > 0, train_distribution / test_distribution, 1)
+        return np.sum((train_distribution - test_distribution) * np.log(ratio))
     except Exception as e:
         return e
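The restored lines compute the standard population stability index, PSI = sum over bins of (p_train - p_test) * ln(p_train / p_test), with np.where guarding against division by an empty test bin. A worked example with hypothetical bin frequencies:

import numpy as np

train_distribution = np.array([0.25, 0.25, 0.25, 0.25])
test_distribution = np.array([0.10, 0.20, 0.30, 0.40])

# Guard against empty test bins, then apply the PSI formula bin by bin.
ratio = np.where(test_distribution > 0, train_distribution / test_distribution, 1)
psi = np.sum((train_distribution - test_distribution) * np.log(ratio))
print(round(psi, 4))  # 0.2282 -> a noticeable shift between the two distributions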
{upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=GHc4XyRcf-LRcunv2-fpap4slj_PhG6QeOQqttDwIno,23
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
-upgini/http.py,sha256=
+upgini/features_enricher.py,sha256=rieH8wjC1c_q2LYZoju8KZyshokNzFpwVtrCtG88w3s,215940
+upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
 upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
 upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
 upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
 upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=
+upgini/data_source/data_source_publisher.py,sha256=ufL8qK1vg8iUKd5bLWz6hEMGiC3JepUaWYx-nBKVqjA,24294
 upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=SxO1uWFAc1s7BOFi01OyEI3ajklUKBhs8LkKrstImIg,28290
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -64,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sklearn_ext.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=
+upgini/utils/target_utils.py,sha256=mVZ8wrkBb-tzEnVZwZw0m-Y0Sojb5t-wIsACRH05nIw,16890
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.88a3884.dev1.dist-info/METADATA,sha256=
-upgini-1.2.88a3884.dev1.dist-info/WHEEL,sha256=
-upgini-1.2.88a3884.dev1.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.88a3884.dev1.dist-info/RECORD,,
+upgini-1.2.90.dist-info/METADATA,sha256=QWKn1q4NNZEH8k41xW03uvPmUSjwb-2uFH_Asecnr44,49162
+upgini-1.2.90.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.90.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.90.dist-info/RECORD,,
{upgini-1.2.88a3884.dev1.dist-info → upgini-1.2.90.dist-info}/licenses/LICENSE
File without changes
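For reference, the RECORD rows above follow the wheel convention "path,sha256=<digest>,size", where the digest is an unpadded urlsafe base64 SHA-256. A sketch that recomputes such an entry for a local file (helper name and usage are illustrative, not part of upgini):

import base64
import hashlib

def record_entry(path: str) -> str:
    # Hash the file, then encode the digest the way wheel RECORD files do:
    # urlsafe base64 with the trailing "=" padding stripped.
    data = open(path, "rb").read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"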