upgini 1.2.88a3884.dev0__py3-none-any.whl → 1.2.89a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the package registry's advisory page for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.88a3884.dev0"
1
+ __version__ = "1.2.89a1"
@@ -5,6 +5,8 @@ from datetime import datetime
5
5
  from enum import Enum
6
6
  from typing import Dict, List, Literal, Optional, Union
7
7
 
8
+ import pandas as pd
9
+
8
10
  from upgini.errors import HttpError, ValidationError
9
11
  from upgini.http import LoggerFactory, get_rest_client
10
12
  from upgini.mdc import MDC
@@ -33,7 +35,7 @@ class OnlineUploadingType(Enum):
33
35
  class DataSourcePublisher:
34
36
  FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
35
37
  ACCEPTABLE_UPDATE_FREQUENCIES = ["Daily", "Weekly", "Monthly", "Quarterly", "Annually"]
36
- DEFAULT_GENERATE_EMBEDDINGS = []
38
+ DEFAULT_GENERATE_EMBEDDINGS = dict()
37
39
 
38
40
  def __init__(self, api_key: Optional[str] = None, endpoint: Optional[str] = None, logs_enabled=True):
39
41
  self._rest_client = get_rest_client(endpoint, api_key)
@@ -58,7 +60,7 @@ class DataSourcePublisher:
58
60
  hash_feature_names=False,
59
61
  snapshot_frequency_days: Optional[int] = None,
60
62
  join_date_abs_limit_days: Optional[int] = None,
61
- features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
63
+ features_for_embeddings: Optional[Dict[str, str]] = DEFAULT_GENERATE_EMBEDDINGS,
62
64
  data_table_id_to_replace: Optional[str] = None,
63
65
  keep_features: Optional[List[str]] = None,
64
66
  date_features: Optional[List[str]] = None,
@@ -137,6 +139,25 @@ class DataSourcePublisher:
137
139
  ) and not date_format:
138
140
  raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
139
141
 
142
+ if secondary_search_keys:
143
+ response = self._rest_client.get_active_ads_definitions()
144
+ definitions = pd.DataFrame(response["adsDefinitions"])
145
+ prod_secondary_definitions = definitions.query(
146
+ "(secondarySearchKeys.astype('string') != '[]') & (adsDefinitionAccessType == 'PROD')"
147
+ )[["name", "searchKeys", "secondarySearchKeys"]]
148
+ for _, row in prod_secondary_definitions.iterrows():
149
+ existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
150
+ if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
151
+ existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
152
+ if (
153
+ existing_search_keys == {v.value.name for v in search_keys.values()}
154
+ or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
155
+ ):
156
+ raise ValidationError(
157
+ "ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
158
+ f"already exists: {row['name']}"
159
+ )
160
+
140
161
  request = {
141
162
  "dataTableUri": data_table_uri,
142
163
  "searchKeys": {k: v.value.value for k, v in search_keys.items()},
@@ -7,7 +7,6 @@ import json
7
7
  import logging
8
8
  import numbers
9
9
  import os
10
- import pickle
11
10
  import sys
12
11
  import tempfile
13
12
  import time
@@ -3450,6 +3449,11 @@ if response.status_code == 200:
3450
3449
  if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3451
3450
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
3452
3451
 
3452
+ # Check for duplicates between train and eval sets by comparing all values
3453
+ train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
3454
+ if len(train_eval_intersection) > 0:
3455
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3456
+
3453
3457
  return validated_eval_X, validated_eval_y
3454
3458
 
3455
3459
  def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
@@ -4569,60 +4573,52 @@ if response.status_code == 200:
4569
4573
  y: Union[pd.DataFrame, pd.Series, None] = None,
4570
4574
  eval_set: Union[Tuple, None] = None,
4571
4575
  ):
4572
- def dump_task():
4576
+ def dump_task(X_, y_, eval_set_):
4573
4577
  try:
4574
- random_state = 42
4575
- rnd = np.random.RandomState(random_state)
4576
- if _num_samples(X) > 0:
4577
- xy_sample_index = rnd.randint(0, _num_samples(X), size=1000)
4578
- else:
4579
- xy_sample_index = []
4580
-
4581
- def sample(inp, sample_index):
4582
- if _num_samples(inp) <= 1000:
4583
- return inp
4584
- if isinstance(inp, (pd.DataFrame, pd.Series)):
4585
- return inp.sample(n=1000, random_state=random_state)
4586
- if isinstance(inp, np.ndarray):
4587
- return inp[sample_index]
4588
- if isinstance(inp, list):
4589
- return inp[sample_index]
4578
+ if isinstance(X_, pd.Series):
4579
+ X_ = X_.to_frame()
4580
+
4581
+ # TODO check that this file was already uploaded
4590
4582
 
4591
4583
  with tempfile.TemporaryDirectory() as tmp_dir:
4592
- with open(f"{tmp_dir}/x.pickle", "wb") as x_file:
4593
- pickle.dump(sample(X, xy_sample_index), x_file)
4594
- if y is not None:
4595
- with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
4596
- pickle.dump(sample(y, xy_sample_index), y_file)
4597
- if eval_set and _num_samples(eval_set[0][0]) > 0:
4598
- eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
4599
- with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
4600
- pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
4601
- with open(f"{tmp_dir}/eval_y.pickle", "wb") as eval_y_file:
4602
- pickle.dump(sample(eval_set[0][1], eval_xy_sample_index), eval_y_file)
4584
+ X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
4585
+
4586
+ if y_ is not None:
4587
+ if isinstance(y_, pd.Series):
4588
+ y_ = y_.to_frame()
4589
+ y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
4590
+ if eval_set_ and _num_samples(eval_set_[0][0]) > 0:
4591
+ eval_x_ = eval_set_[0][0]
4592
+ eval_y_ = eval_set_[0][1]
4593
+ if isinstance(eval_x_, pd.Series):
4594
+ eval_x_ = eval_x_.to_frame()
4595
+ eval_x_.to_parquet(f"{tmp_dir}/eval_x.parquet", compression="zstd")
4596
+ if isinstance(eval_y_, pd.Series):
4597
+ eval_y_ = eval_y_.to_frame()
4598
+ eval_y_.to_parquet(f"{tmp_dir}/eval_y.parquet", compression="zstd")
4603
4599
  self.rest_client.dump_input_files(
4604
4600
  trace_id,
4605
- f"{tmp_dir}/x.pickle",
4606
- f"{tmp_dir}/y.pickle",
4607
- f"{tmp_dir}/eval_x.pickle",
4608
- f"{tmp_dir}/eval_y.pickle",
4601
+ f"{tmp_dir}/x.parquet",
4602
+ f"{tmp_dir}/y.parquet",
4603
+ f"{tmp_dir}/eval_x.parquet",
4604
+ f"{tmp_dir}/eval_y.parquet",
4609
4605
  )
4610
4606
  else:
4611
4607
  self.rest_client.dump_input_files(
4612
4608
  trace_id,
4613
- f"{tmp_dir}/x.pickle",
4614
- f"{tmp_dir}/y.pickle",
4609
+ f"{tmp_dir}/x.parquet",
4610
+ f"{tmp_dir}/y.parquet",
4615
4611
  )
4616
4612
  else:
4617
4613
  self.rest_client.dump_input_files(
4618
4614
  trace_id,
4619
- f"{tmp_dir}/x.pickle",
4615
+ f"{tmp_dir}/x.parquet",
4620
4616
  )
4621
4617
  except Exception:
4622
4618
  self.logger.warning("Failed to dump input files", exc_info=True)
4623
4619
 
4624
4620
  try:
4625
- Thread(target=dump_task, daemon=True).start()
4621
+ Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
4626
4622
  except Exception:
4627
4623
  self.logger.warning("Failed to dump input files", exc_info=True)
4628
4624
 
@@ -137,6 +137,8 @@ eval_y_multiindex_unsupported=Multi index in y in eval_set is not supported
137
137
  eval_x_is_empty=X in eval_set is empty.
138
138
  eval_y_is_empty=y in eval_set is empty.
139
139
  x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
140
+ eval_x_has_train_samples=Eval set X has rows that are present in train set X
141
+
140
142
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
141
143
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
142
144
  missing_features_for_transform=Missing some features for transform that were presented on fit: {}
@@ -1324,7 +1324,7 @@ def _encode_cat_features(X_train, y_train, X_test, y_test, cat_features, estimat
1324
1324
  else:
1325
1325
  # Shuffle train data
1326
1326
  X_train_shuffled, y_train_shuffled = _shuffle_pair(
1327
- X_train[cat_features].astype("object"), y_train, random_state
1327
+ X_train[cat_features], y_train, random_state
1328
1328
  )
1329
1329
 
1330
1330
  # Fit encoder on training fold
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.88a3884.dev0
3
+ Version: 1.2.89a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=9UxVEFo0h8LcuPSKD5JSZ_n02IZF15Ksx8d1ITu4M7U,33
1
+ upgini/__about__.py,sha256=zOnCJXwGq7WXQ73_SrUBkMqGfV00s4WlXFUPNyejNQ8,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=eFnJVb8jM1INlT-imfjafhWtOfx9EJv2HSvlfyGy0_U,216188
6
+ upgini/features_enricher.py,sha256=rieH8wjC1c_q2LYZoju8KZyshokNzFpwVtrCtG88w3s,215940
7
7
  upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
8
8
  upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
9
9
  upgini/metrics.py,sha256=zIOaiyfQLedU9Fk4877drnlWh-KiImSkZpPeiq6Xr1E,45295
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
31
31
  upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
32
32
  upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
33
33
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
34
+ upgini/data_source/data_source_publisher.py,sha256=ufL8qK1vg8iUKd5bLWz6hEMGiC3JepUaWYx-nBKVqjA,24294
35
35
  upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
36
36
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
37
37
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=xpHD-3mW1U6Nca0QghC6FSrQLDci9pInuMpOBPPiB8M,28212
41
+ upgini/resource_bundle/strings.properties,sha256=SxO1uWFAc1s7BOFi01OyEI3ajklUKBhs8LkKrstImIg,28290
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -64,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sklearn_ext.py,sha256=Mdxz0tc-9zT4QyNccA3B86fY4l0MnLDr94POVdYeCT4,49332
67
+ upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
69
  upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.88a3884.dev0.dist-info/METADATA,sha256=e_lwt9ydR712gQBymukF9Lc2W-5aqj5nrZa-6T-UXA4,49172
74
- upgini-1.2.88a3884.dev0.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
- upgini-1.2.88a3884.dev0.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.88a3884.dev0.dist-info/RECORD,,
73
+ upgini-1.2.89a1.dist-info/METADATA,sha256=d9XvUcHoqSr2RzIpqLR42x1bffkKnr7PyT6iB6kZGYQ,49164
74
+ upgini-1.2.89a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.89a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.89a1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any