upgini 1.2.82a3853.dev2__py3-none-any.whl → 1.2.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.82a3853.dev2"
+ __version__ = "1.2.83"
upgini/dataset.py CHANGED
@@ -394,7 +394,7 @@ class Dataset: # (pd.DataFrame):
  if col in mandatory_columns:
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
 
- invalid_values = list(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values)  # type: ignore
+ invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
  valid_share = self.data[f"{col}_is_valid"].sum() / nrows
  original_col_name = self.columns_renaming[col]
  validation_stats[original_col_name] = {}
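The functional change in `upgini/dataset.py` is that the sample of invalid values reported per column is now deduplicated through `set()` before being listed. A minimal sketch of the effect on a hypothetical toy frame (the column names and data below are invented; only the `list(set(... .head().values))` pattern comes from the diff):

```python
import pandas as pd

# Toy data imitating the validation flags that Dataset builds:
# "<col>_is_valid" == 0 marks rows whose value in <col> failed validation.
data = pd.DataFrame(
    {
        "phone": ["n/a", "n/a", "+15551234567", "n/a", "+15557654321"],
        "phone_is_valid": [0, 0, 1, 0, 1],
    }
)

invalid_mask = data["phone_is_valid"] == 0

# Old behavior: the first few invalid values, duplicates included.
old_sample = list(data.loc[invalid_mask, "phone"].head().values)       # ['n/a', 'n/a', 'n/a']

# New behavior: the same sample passed through set(), so each value appears once.
new_sample = list(set(data.loc[invalid_mask, "phone"].head().values))  # ['n/a']

print(old_sample, new_sample)
```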
upgini/features_enricher.py CHANGED
@@ -1559,9 +1559,12 @@ class FeaturesEnricher(TransformerMixin):
  fitting_X = X_sorted[client_features].copy()
  fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
 
+ renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
+ renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
+
  # Detect and drop high cardinality columns in train
  columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
- non_excluding_columns = (self.generate_features or []) + (client_cat_features or [])
+ non_excluding_columns = renamed_generate_features + renamed_client_cat_features
  columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
  if len(columns_with_high_cardinality) > 0:
  self.logger.warning(
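This hunk translates `generate_features` and `client_cat_features` through `columns_renaming` before they are used as the exclusion list for the high-cardinality filter, so the comparison uses the same column names that the high-cardinality candidates carry. A small sketch with invented names and an assumed mapping direction (the real `columns_renaming` dict is built elsewhere in the enricher and may map the other way):

```python
# Hypothetical mapping from the names passed in generate_features /
# client_cat_features to the names used by the high-cardinality detector.
columns_renaming = {"merchant_id": "feature_0", "city": "feature_1"}

generate_features = ["merchant_id"]
client_cat_features = ["city", "segment"]  # "segment" has no renamed counterpart

renamed_generate_features = [columns_renaming.get(c, c) for c in (generate_features or [])]
renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]

# Columns flagged as high-cardinality (invented for the example).
columns_with_high_cardinality = ["feature_0", "feature_1", "user_uuid"]

# With the translated exclusion list only "user_uuid" remains a candidate for
# dropping; with the untranslated names the generated/categorical columns
# could fail to match and risk being dropped as well.
non_excluding_columns = renamed_generate_features + renamed_client_cat_features
columns_with_high_cardinality = [
    c for c in columns_with_high_cardinality if c not in non_excluding_columns
]
print(columns_with_high_cardinality)  # ['user_uuid']
```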
@@ -2447,13 +2450,15 @@ if response.status_code == 200:
  meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
  meaning_types.update({col: key.value for col, key in search_keys.items()})
 
- features_not_to_pass.extend([
- c
- for c in df.columns
- if c not in search_keys.keys()
- and c not in features_for_transform
- and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
- ])
+ features_not_to_pass.extend(
+ [
+ c
+ for c in df.columns
+ if c not in search_keys.keys()
+ and c not in features_for_transform
+ and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
+ ]
+ )
 
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -3712,7 +3717,7 @@ if response.status_code == 200:
  columns_to_sort = [date_column] if date_column is not None else []
 
  do_sorting = True
- if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
+ if self.id_columns and self.cv.is_time_series():
  # Check duplicates by date and id_columns
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
  renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
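Here the membership test against `CVType.time_series` and `CVType.blocked_time_series` is replaced by a `self.cv.is_time_series()` call. A hypothetical sketch of such a helper on an enum, assuming it simply wraps the old membership check (the real `CVType` in upgini has more members and may be implemented differently):

```python
from enum import Enum


class CVType(Enum):
    # Members shortened for the sketch; time_series and blocked_time_series
    # are the two values the old check looked for.
    k_fold = "k_fold"
    time_series = "time_series"
    blocked_time_series = "blocked_time_series"

    def is_time_series(self) -> bool:
        # Equivalent to: self in [CVType.time_series, CVType.blocked_time_series]
        return self in (CVType.time_series, CVType.blocked_time_series)


assert CVType.blocked_time_series.is_time_series()
assert not CVType.k_fold.is_time_series()
```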
@@ -3722,14 +3727,7 @@ if response.status_code == 200:
 
  duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
  if duplicates.any():
- if not silent:
- self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
- else:
- self.logger.warning(
- f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
- " Will not sort dataset"
- )
- do_sorting = False
+ raise ValueError(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
  else:
  columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
  columns_to_hash = sort_columns(
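Duplicate rows by date plus `id_columns` previously produced a warning (or only a log entry in silent mode) and disabled sorting; in 1.2.83 they raise a `ValueError` built from the relocated `date_and_id_columns_duplicates` message (see the `strings.properties` hunks below). A toy reproduction of the check, with invented column names and data:

```python
import pandas as pd

# Hypothetical frame where two rows share the same date and id value.
df = pd.DataFrame(
    {
        "date": ["2024-01-01", "2024-01-01", "2024-01-02"],
        "store_id": [1, 1, 2],
        "target": [10.0, 12.0, 9.0],
    }
)

duplicate_check_columns = ["date", "store_id"]
duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)

if duplicates.any():
    # 1.2.82 only warned and skipped sorting; 1.2.83 fails fast with the
    # bundle message "Found {} duplicate rows by date and id_columns. ..."
    raise ValueError(
        f"Found {duplicates.sum()} duplicate rows by date and id_columns. "
        "Please remove them and try again"
    )
```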
upgini/resource_bundle/strings.properties CHANGED
@@ -36,7 +36,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
  loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
  loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
  multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
- date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
  group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
  current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
  # Errors
@@ -191,6 +190,7 @@ timeseries_invalid_split_count=Cross-validation requires at least one train/test
  timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) range
  timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
  timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
+ date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
  # Upload ads validation
  ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
  ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
{upgini-1.2.82a3853.dev2.dist-info → upgini-1.2.83.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.82a3853.dev2
+ Version: 1.2.83
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
{upgini-1.2.82a3853.dev2.dist-info → upgini-1.2.83.dist-info}/RECORD RENAMED
@@ -1,9 +1,9 @@
- upgini/__about__.py,sha256=FXbALhFkuQ3hoEzhmp_olrO40HwaCvGiCKus4wSy940,33
+ upgini/__about__.py,sha256=h2Ibse6YuHdXZHWt0iDDzwOXKJiV09m9RkgFgDU2HsY,23
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
- upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
+ upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=cWbEA2lOt51x62NrLkyxu1G8I4KQo_2aOgqt3Ypyr1M,212819
+ upgini/features_enricher.py,sha256=2ryADtOVEEebuUBhimusvnBzGxUkdTaqpEh2F1PqHSs,212719
  upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
  upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
  upgini/metrics.py,sha256=3cip0_L6-OFew74KsRwzxJDU6UFq05h2v7IsyHLcMRc,43164
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
- upgini/resource_bundle/strings.properties,sha256=GmkTgxowpykuuviubVH5cMF_lNFQJEqfRoBJaj3c72E,27957
+ upgini/resource_bundle/strings.properties,sha256=U_ewTI-qPww4X3WcFG3qDf_jv2vo6RrlCehVDjqtzEI,27991
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.82a3853.dev2.dist-info/METADATA,sha256=rxFyB6IXqVf7wFvGuT5Y5rqK3cFafMDQQl28-i1j09M,49172
- upgini-1.2.82a3853.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
- upgini-1.2.82a3853.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.82a3853.dev2.dist-info/RECORD,,
+ upgini-1.2.83.dist-info/METADATA,sha256=roEvcDdV_9hyXf6geUHr-fBo6k-yBs7RfCXdrLEnhac,49162
+ upgini-1.2.83.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+ upgini-1.2.83.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.83.dist-info/RECORD,,