upgini 1.2.88a3884.dev0__py3-none-any.whl → 1.2.89a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +23 -2
- upgini/features_enricher.py +33 -37
- upgini/resource_bundle/strings.properties +2 -0
- upgini/utils/sklearn_ext.py +1 -1
- {upgini-1.2.88a3884.dev0.dist-info → upgini-1.2.89a1.dist-info}/METADATA +1 -1
- {upgini-1.2.88a3884.dev0.dist-info → upgini-1.2.89a1.dist-info}/RECORD +9 -9
- {upgini-1.2.88a3884.dev0.dist-info → upgini-1.2.89a1.dist-info}/WHEEL +1 -1
- {upgini-1.2.88a3884.dev0.dist-info → upgini-1.2.89a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.
|
1
|
+
__version__ = "1.2.89a1"
|
@@ -5,6 +5,8 @@ from datetime import datetime
|
|
5
5
|
from enum import Enum
|
6
6
|
from typing import Dict, List, Literal, Optional, Union
|
7
7
|
|
8
|
+
import pandas as pd
|
9
|
+
|
8
10
|
from upgini.errors import HttpError, ValidationError
|
9
11
|
from upgini.http import LoggerFactory, get_rest_client
|
10
12
|
from upgini.mdc import MDC
|
@@ -33,7 +35,7 @@ class OnlineUploadingType(Enum):
|
|
33
35
|
class DataSourcePublisher:
|
34
36
|
FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
|
35
37
|
ACCEPTABLE_UPDATE_FREQUENCIES = ["Daily", "Weekly", "Monthly", "Quarterly", "Annually"]
|
36
|
-
DEFAULT_GENERATE_EMBEDDINGS =
|
38
|
+
DEFAULT_GENERATE_EMBEDDINGS = dict()
|
37
39
|
|
38
40
|
def __init__(self, api_key: Optional[str] = None, endpoint: Optional[str] = None, logs_enabled=True):
|
39
41
|
self._rest_client = get_rest_client(endpoint, api_key)
|
@@ -58,7 +60,7 @@ class DataSourcePublisher:
|
|
58
60
|
hash_feature_names=False,
|
59
61
|
snapshot_frequency_days: Optional[int] = None,
|
60
62
|
join_date_abs_limit_days: Optional[int] = None,
|
61
|
-
features_for_embeddings: Optional[
|
63
|
+
features_for_embeddings: Optional[Dict[str, str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
62
64
|
data_table_id_to_replace: Optional[str] = None,
|
63
65
|
keep_features: Optional[List[str]] = None,
|
64
66
|
date_features: Optional[List[str]] = None,
|
@@ -137,6 +139,25 @@ class DataSourcePublisher:
|
|
137
139
|
) and not date_format:
|
138
140
|
raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
|
139
141
|
|
142
|
+
if secondary_search_keys:
|
143
|
+
response = self._rest_client.get_active_ads_definitions()
|
144
|
+
definitions = pd.DataFrame(response["adsDefinitions"])
|
145
|
+
prod_secondary_definitions = definitions.query(
|
146
|
+
"(secondarySearchKeys.astype('string') != '[]') & (adsDefinitionAccessType == 'PROD')"
|
147
|
+
)[["name", "searchKeys", "secondarySearchKeys"]]
|
148
|
+
for _, row in prod_secondary_definitions.iterrows():
|
149
|
+
existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
|
150
|
+
if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
|
151
|
+
existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
|
152
|
+
if (
|
153
|
+
existing_search_keys == {v.value.name for v in search_keys.values()}
|
154
|
+
or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
|
155
|
+
):
|
156
|
+
raise ValidationError(
|
157
|
+
"ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
|
158
|
+
f"already exists: {row['name']}"
|
159
|
+
)
|
160
|
+
|
140
161
|
request = {
|
141
162
|
"dataTableUri": data_table_uri,
|
142
163
|
"searchKeys": {k: v.value.value for k, v in search_keys.items()},
|
upgini/features_enricher.py
CHANGED
@@ -7,7 +7,6 @@ import json
|
|
7
7
|
import logging
|
8
8
|
import numbers
|
9
9
|
import os
|
10
|
-
import pickle
|
11
10
|
import sys
|
12
11
|
import tempfile
|
13
12
|
import time
|
@@ -3450,6 +3449,11 @@ if response.status_code == 200:
|
|
3450
3449
|
if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
|
3451
3450
|
raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
|
3452
3451
|
|
3452
|
+
# Check for duplicates between train and eval sets by comparing all values
|
3453
|
+
train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
|
3454
|
+
if len(train_eval_intersection) > 0:
|
3455
|
+
raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
|
3456
|
+
|
3453
3457
|
return validated_eval_X, validated_eval_y
|
3454
3458
|
|
3455
3459
|
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
|
@@ -4569,60 +4573,52 @@ if response.status_code == 200:
|
|
4569
4573
|
y: Union[pd.DataFrame, pd.Series, None] = None,
|
4570
4574
|
eval_set: Union[Tuple, None] = None,
|
4571
4575
|
):
|
4572
|
-
def dump_task():
|
4576
|
+
def dump_task(X_, y_, eval_set_):
|
4573
4577
|
try:
|
4574
|
-
|
4575
|
-
|
4576
|
-
|
4577
|
-
|
4578
|
-
else:
|
4579
|
-
xy_sample_index = []
|
4580
|
-
|
4581
|
-
def sample(inp, sample_index):
|
4582
|
-
if _num_samples(inp) <= 1000:
|
4583
|
-
return inp
|
4584
|
-
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
4585
|
-
return inp.sample(n=1000, random_state=random_state)
|
4586
|
-
if isinstance(inp, np.ndarray):
|
4587
|
-
return inp[sample_index]
|
4588
|
-
if isinstance(inp, list):
|
4589
|
-
return inp[sample_index]
|
4578
|
+
if isinstance(X_, pd.Series):
|
4579
|
+
X_ = X_.to_frame()
|
4580
|
+
|
4581
|
+
# TODO check that this file was already uploaded
|
4590
4582
|
|
4591
4583
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
4592
|
-
|
4593
|
-
|
4594
|
-
if
|
4595
|
-
|
4596
|
-
|
4597
|
-
|
4598
|
-
|
4599
|
-
|
4600
|
-
|
4601
|
-
|
4602
|
-
|
4584
|
+
X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
|
4585
|
+
|
4586
|
+
if y_ is not None:
|
4587
|
+
if isinstance(y_, pd.Series):
|
4588
|
+
y_ = y_.to_frame()
|
4589
|
+
y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
|
4590
|
+
if eval_set_ and _num_samples(eval_set_[0][0]) > 0:
|
4591
|
+
eval_x_ = eval_set_[0][0]
|
4592
|
+
eval_y_ = eval_set_[0][1]
|
4593
|
+
if isinstance(eval_x_, pd.Series):
|
4594
|
+
eval_x_ = eval_x_.to_frame()
|
4595
|
+
eval_x_.to_parquet(f"{tmp_dir}/eval_x.parquet", compression="zstd")
|
4596
|
+
if isinstance(eval_y_, pd.Series):
|
4597
|
+
eval_y_ = eval_y_.to_frame()
|
4598
|
+
eval_y_.to_parquet(f"{tmp_dir}/eval_y.parquet", compression="zstd")
|
4603
4599
|
self.rest_client.dump_input_files(
|
4604
4600
|
trace_id,
|
4605
|
-
f"{tmp_dir}/x.
|
4606
|
-
f"{tmp_dir}/y.
|
4607
|
-
f"{tmp_dir}/eval_x.
|
4608
|
-
f"{tmp_dir}/eval_y.
|
4601
|
+
f"{tmp_dir}/x.parquet",
|
4602
|
+
f"{tmp_dir}/y.parquet",
|
4603
|
+
f"{tmp_dir}/eval_x.parquet",
|
4604
|
+
f"{tmp_dir}/eval_y.parquet",
|
4609
4605
|
)
|
4610
4606
|
else:
|
4611
4607
|
self.rest_client.dump_input_files(
|
4612
4608
|
trace_id,
|
4613
|
-
f"{tmp_dir}/x.
|
4614
|
-
f"{tmp_dir}/y.
|
4609
|
+
f"{tmp_dir}/x.parquet",
|
4610
|
+
f"{tmp_dir}/y.parquet",
|
4615
4611
|
)
|
4616
4612
|
else:
|
4617
4613
|
self.rest_client.dump_input_files(
|
4618
4614
|
trace_id,
|
4619
|
-
f"{tmp_dir}/x.
|
4615
|
+
f"{tmp_dir}/x.parquet",
|
4620
4616
|
)
|
4621
4617
|
except Exception:
|
4622
4618
|
self.logger.warning("Failed to dump input files", exc_info=True)
|
4623
4619
|
|
4624
4620
|
try:
|
4625
|
-
Thread(target=dump_task, daemon=True).start()
|
4621
|
+
Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
|
4626
4622
|
except Exception:
|
4627
4623
|
self.logger.warning("Failed to dump input files", exc_info=True)
|
4628
4624
|
|
@@ -137,6 +137,8 @@ eval_y_multiindex_unsupported=Multi index in y in eval_set is not supported
|
|
137
137
|
eval_x_is_empty=X in eval_set is empty.
|
138
138
|
eval_y_is_empty=y in eval_set is empty.
|
139
139
|
x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
140
|
+
eval_x_has_train_samples=Eval set X has rows that are present in train set X
|
141
|
+
|
140
142
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
141
143
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
142
144
|
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
upgini/utils/sklearn_ext.py
CHANGED
@@ -1324,7 +1324,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
|
|
1324
1324
|
else:
|
1325
1325
|
# Shuffle train data
|
1326
1326
|
X_train_shuffled, y_train_shuffled = _shuffle_pair(
|
1327
|
-
X_train[cat_features]
|
1327
|
+
X_train[cat_features], y_train, random_state
|
1328
1328
|
)
|
1329
1329
|
|
1330
1330
|
# Fit encoder on training fold
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=zOnCJXwGq7WXQ73_SrUBkMqGfV00s4WlXFUPNyejNQ8,25
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=rieH8wjC1c_q2LYZoju8KZyshokNzFpwVtrCtG88w3s,215940
|
7
7
|
upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
9
|
upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
|
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
|
|
31
31
|
upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
|
32
32
|
upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
|
33
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
34
|
+
upgini/data_source/data_source_publisher.py,sha256=ufL8qK1vg8iUKd5bLWz6hEMGiC3JepUaWYx-nBKVqjA,24294
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=SxO1uWFAc1s7BOFi01OyEI3ajklUKBhs8LkKrstImIg,28290
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -64,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
|
-
upgini/utils/sklearn_ext.py,sha256=
|
67
|
+
upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
|
68
68
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
69
69
|
upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.89a1.dist-info/METADATA,sha256=d9XvUcHoqSr2RzIpqLR42x1bffkKnr7PyT6iB6kZGYQ,49164
|
74
|
+
upgini-1.2.89a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.89a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.89a1.dist-info/RECORD,,
|
File without changes
|