upgini 1.2.82a3853.dev1__py3-none-any.whl → 1.2.83a3853.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +1 -1
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +15 -17
- upgini/resource_bundle/strings.properties +1 -1
- {upgini-1.2.82a3853.dev1.dist-info → upgini-1.2.83a3853.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.82a3853.dev1.dist-info → upgini-1.2.83a3853.dev1.dist-info}/RECORD +9 -9
- {upgini-1.2.82a3853.dev1.dist-info → upgini-1.2.83a3853.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.82a3853.dev1.dist-info → upgini-1.2.83a3853.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.82a3853.dev1"
|
1
|
+
__version__ = "1.2.83a3853.dev1"
|
upgini/autofe/feature.py
CHANGED
@@ -117,7 +117,7 @@ class Feature:
|
|
117
117
|
+ [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
|
118
118
|
+ [ch.get_display_name() for ch in self.children]
|
119
119
|
).encode("utf-8")
|
120
|
-
).hexdigest()[:
|
120
|
+
).hexdigest()[:10]
|
121
121
|
|
122
122
|
def set_alias(self, alias: str) -> "Feature":
|
123
123
|
self.alias = alias
|
upgini/dataset.py
CHANGED
@@ -394,7 +394,7 @@ class Dataset: # (pd.DataFrame):
|
|
394
394
|
if col in mandatory_columns:
|
395
395
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
396
396
|
|
397
|
-
invalid_values = list(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values)
|
397
|
+
invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
|
398
398
|
valid_share = self.data[f"{col}_is_valid"].sum() / nrows
|
399
399
|
original_col_name = self.columns_renaming[col]
|
400
400
|
validation_stats[original_col_name] = {}
|
upgini/features_enricher.py
CHANGED
@@ -1559,9 +1559,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1559
1559
|
fitting_X = X_sorted[client_features].copy()
|
1560
1560
|
fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
|
1561
1561
|
|
1562
|
+
renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
|
1563
|
+
renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
|
1564
|
+
|
1562
1565
|
# Detect and drop high cardinality columns in train
|
1563
1566
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
1564
|
-
non_excluding_columns =
|
1567
|
+
non_excluding_columns = renamed_generate_features + renamed_client_cat_features
|
1565
1568
|
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
1566
1569
|
if len(columns_with_high_cardinality) > 0:
|
1567
1570
|
self.logger.warning(
|
@@ -2447,13 +2450,15 @@ if response.status_code == 200:
|
|
2447
2450
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2448
2451
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
2449
2452
|
|
2450
|
-
features_not_to_pass.extend(
|
2451
|
-
|
2452
|
-
|
2453
|
-
|
2454
|
-
|
2455
|
-
|
2456
|
-
|
2453
|
+
features_not_to_pass.extend(
|
2454
|
+
[
|
2455
|
+
c
|
2456
|
+
for c in df.columns
|
2457
|
+
if c not in search_keys.keys()
|
2458
|
+
and c not in features_for_transform
|
2459
|
+
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
2460
|
+
]
|
2461
|
+
)
|
2457
2462
|
|
2458
2463
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2459
2464
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
@@ -3712,7 +3717,7 @@ if response.status_code == 200:
|
|
3712
3717
|
columns_to_sort = [date_column] if date_column is not None else []
|
3713
3718
|
|
3714
3719
|
do_sorting = True
|
3715
|
-
if self.id_columns and self.cv
|
3720
|
+
if self.id_columns and self.cv.is_time_series():
|
3716
3721
|
# Check duplicates by date and id_columns
|
3717
3722
|
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
3718
3723
|
renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
|
@@ -3722,14 +3727,7 @@ if response.status_code == 200:
|
|
3722
3727
|
|
3723
3728
|
duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
|
3724
3729
|
if duplicates.any():
|
3725
|
-
|
3726
|
-
self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
3727
|
-
else:
|
3728
|
-
self.logger.warning(
|
3729
|
-
f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
|
3730
|
-
" Will not sort dataset"
|
3731
|
-
)
|
3732
|
-
do_sorting = False
|
3730
|
+
raise ValueError(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
3733
3731
|
else:
|
3734
3732
|
columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
|
3735
3733
|
columns_to_hash = sort_columns(
|
@@ -36,7 +36,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
36
36
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
37
37
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
38
38
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
39
|
-
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
40
39
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
41
40
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
42
41
|
# Errors
|
@@ -191,6 +190,7 @@ timeseries_invalid_split_count=Cross-validation requires at least one train/test
|
|
191
190
|
timeseries_invalid_test_size_type=test_size={} should be a float in the (0, 1) range
|
192
191
|
timeseries_splits_more_than_samples=Number of splits={} can't be more than number of samples={}
|
193
192
|
timeseries_invalid_test_size=Wrong number of samples in a test fold: (test_size * n_samples / n_splits) <= 1
|
193
|
+
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns. Please remove them and try again
|
194
194
|
# Upload ads validation
|
195
195
|
ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase the sample size for evaluation and resubmit the data
|
196
196
|
ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.82a3853.dev1
|
3
|
+
Version: 1.2.83a3853.dev1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=pE92yxPL5O3G3o-N3okdWiKFsRCImwb_aZBdl9D0e38,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=2ryADtOVEEebuUBhimusvnBzGxUkdTaqpEh2F1PqHSs,212719
|
7
7
|
upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
9
|
upgini/metrics.py,sha256=3cip0_L6-OFew74KsRwzxJDU6UFq05h2v7IsyHLcMRc,43164
|
@@ -16,7 +16,7 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
|
18
18
|
upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
|
19
|
-
upgini/autofe/feature.py,sha256=
|
19
|
+
upgini/autofe/feature.py,sha256=MjBbviB5Jy90EuWlnhgGgBptn3GomJv0xNjhjZN0P5I,15329
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
21
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
22
22
|
upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=U_ewTI-qPww4X3WcFG3qDf_jv2vo6RrlCehVDjqtzEI,27991
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.83a3853.dev1.dist-info/METADATA,sha256=XLOx9TyPMTtLczwt9SZduqjkP1yHiJmLDy67HNXIV9s,49172
|
74
|
+
upgini-1.2.83a3853.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
75
|
+
upgini-1.2.83a3853.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.83a3853.dev1.dist-info/RECORD,,
|
File without changes
|
File without changes
|