upgini 1.2.57a3675.dev4__py3-none-any.whl → 1.2.58a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +0 -8
- upgini/autofe/feature.py +10 -1
- upgini/data_source/data_source_publisher.py +0 -1
- upgini/dataset.py +8 -16
- upgini/features_enricher.py +60 -51
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/email_utils.py +6 -6
- upgini/utils/mstats.py +177 -0
- upgini/utils/sort.py +160 -0
- {upgini-1.2.57a3675.dev4.dist-info → upgini-1.2.58a1.dist-info}/METADATA +1 -1
- {upgini-1.2.57a3675.dev4.dist-info → upgini-1.2.58a1.dist-info}/RECORD +14 -12
- {upgini-1.2.57a3675.dev4.dist-info → upgini-1.2.58a1.dist-info}/WHEEL +1 -1
- {upgini-1.2.57a3675.dev4.dist-info → upgini-1.2.58a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.57a3675.dev4"
|
|
1
|
+
__version__ = "1.2.58a1"
|
upgini/autofe/date.py
CHANGED
|
@@ -64,9 +64,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
64
|
return res
|
|
65
65
|
|
|
66
66
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
67
|
-
if left.isna().all() or right.isna().all():
|
|
68
|
-
return pd.Series([None] * len(left))
|
|
69
|
-
|
|
70
67
|
left = self._convert_to_date(left, self.left_unit)
|
|
71
68
|
right = self._convert_to_date(right, self.right_unit)
|
|
72
69
|
diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
|
|
@@ -145,9 +142,6 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
145
142
|
return cls(aggregation=aggregation)
|
|
146
143
|
|
|
147
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
148
|
-
if left.isna().all() or right.isna().all():
|
|
149
|
-
return pd.Series([None] * len(left), dtype=np.float64)
|
|
150
|
-
|
|
151
145
|
left = self._convert_to_date(left, self.left_unit)
|
|
152
146
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
153
147
|
mask = left.notna() & right.notna() & right_mask
|
|
@@ -236,8 +230,6 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
236
230
|
pass
|
|
237
231
|
|
|
238
232
|
def _perc(self, f, bounds):
|
|
239
|
-
if f is None or np.isnan(f):
|
|
240
|
-
return np.nan
|
|
241
233
|
hit = np.where(f >= np.array(bounds))[0]
|
|
242
234
|
if hit.size > 0:
|
|
243
235
|
return np.max(hit) + 1
|
upgini/autofe/feature.py
CHANGED
|
@@ -26,9 +26,18 @@ class Column:
|
|
|
26
26
|
return dict()
|
|
27
27
|
|
|
28
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
29
|
-
self.name = mapping.get(self.name) or self.name
|
|
29
|
+
self.name = self._unhash(mapping.get(self.name) or self.name)
|
|
30
30
|
return self
|
|
31
31
|
|
|
32
|
+
def _unhash(self, feature_name: str) -> str:
|
|
33
|
+
last_component_idx = feature_name.rfind("_")
|
|
34
|
+
if not feature_name.startswith("f_"):
|
|
35
|
+
return feature_name # etalon feature
|
|
36
|
+
elif last_component_idx == 1:
|
|
37
|
+
return feature_name[2:] # fully hashed name, cannot unhash
|
|
38
|
+
else:
|
|
39
|
+
return feature_name[2:last_component_idx]
|
|
40
|
+
|
|
32
41
|
def delete_data(self):
|
|
33
42
|
self.data = None
|
|
34
43
|
|
|
@@ -386,7 +386,6 @@ class DataSourcePublisher:
|
|
|
386
386
|
search_keys = [k.value.value for k in search_keys] if search_keys else None
|
|
387
387
|
request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
|
|
388
388
|
task_id = self._rest_client.upload_online(request, trace_id)
|
|
389
|
-
print(f"Uploading online task created. task_id={task_id}")
|
|
390
389
|
with Spinner():
|
|
391
390
|
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
392
391
|
while status_response["status"] not in self.FINAL_STATUSES:
|
upgini/dataset.py
CHANGED
|
@@ -587,23 +587,15 @@ class Dataset: # (pd.DataFrame):
|
|
|
587
587
|
if (
|
|
588
588
|
runtime_parameters is not None
|
|
589
589
|
and runtime_parameters.properties is not None
|
|
590
|
+
and "generate_features" in runtime_parameters.properties
|
|
590
591
|
):
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
for
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
599
|
-
if "columns_for_online_api" in runtime_parameters.properties:
|
|
600
|
-
columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
|
|
601
|
-
renamed_columns_for_online_api = []
|
|
602
|
-
for f in columns_for_online_api:
|
|
603
|
-
for new_column, orig_column in self.columns_renaming.items():
|
|
604
|
-
if f == orig_column:
|
|
605
|
-
renamed_columns_for_online_api.append(new_column)
|
|
606
|
-
runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
|
|
592
|
+
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
|
+
renamed_generate_features = []
|
|
594
|
+
for f in generate_features:
|
|
595
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
596
|
+
if f == orig_column:
|
|
597
|
+
renamed_generate_features.append(new_column)
|
|
598
|
+
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
607
599
|
|
|
608
600
|
return runtime_parameters
|
|
609
601
|
|
upgini/features_enricher.py
CHANGED
|
@@ -112,6 +112,7 @@ try:
|
|
|
112
112
|
except Exception:
|
|
113
113
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
114
114
|
|
|
115
|
+
from upgini.utils.sort import sort_columns
|
|
115
116
|
from upgini.utils.target_utils import (
|
|
116
117
|
balance_undersample_forced,
|
|
117
118
|
calculate_psi,
|
|
@@ -222,7 +223,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
222
223
|
loss: Optional[str] = None,
|
|
223
224
|
detect_missing_search_keys: bool = True,
|
|
224
225
|
generate_features: Optional[List[str]] = None,
|
|
225
|
-
columns_for_online_api: Optional[List[str]] = None,
|
|
226
226
|
round_embeddings: Optional[int] = None,
|
|
227
227
|
logs_enabled: bool = True,
|
|
228
228
|
raise_validation_error: bool = True,
|
|
@@ -346,9 +346,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
346
346
|
self.logger.error(msg)
|
|
347
347
|
raise ValidationError(msg)
|
|
348
348
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
349
|
-
self.columns_for_online_api = columns_for_online_api
|
|
350
|
-
if columns_for_online_api is not None:
|
|
351
|
-
self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
|
|
352
349
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
|
353
350
|
if maybe_downsampling_limit is not None:
|
|
354
351
|
Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
|
|
@@ -1261,7 +1258,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1261
1258
|
for feature, shap in new_shaps.items()
|
|
1262
1259
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1263
1260
|
}
|
|
1264
|
-
self.__prepare_feature_importances(trace_id, x_columns, new_shaps
|
|
1261
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
|
|
1265
1262
|
|
|
1266
1263
|
if self.features_info_display_handle is not None:
|
|
1267
1264
|
try:
|
|
@@ -1738,7 +1735,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1738
1735
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1739
1736
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1740
1737
|
|
|
1741
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1738
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET)
|
|
1742
1739
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1743
1740
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1744
1741
|
|
|
@@ -1919,6 +1916,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1919
1916
|
progress_bar=progress_bar,
|
|
1920
1917
|
progress_callback=progress_callback,
|
|
1921
1918
|
add_fit_system_record_id=True,
|
|
1919
|
+
target_name=tmp_target_name,
|
|
1922
1920
|
)
|
|
1923
1921
|
if enriched_df is None:
|
|
1924
1922
|
return None
|
|
@@ -1968,6 +1966,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1968
1966
|
progress_bar=progress_bar,
|
|
1969
1967
|
progress_callback=progress_callback,
|
|
1970
1968
|
add_fit_system_record_id=True,
|
|
1969
|
+
target_name=tmp_target_name,
|
|
1971
1970
|
)
|
|
1972
1971
|
if enriched_Xy is None:
|
|
1973
1972
|
return None
|
|
@@ -2129,6 +2128,7 @@ if response.status_code == 200:
|
|
|
2129
2128
|
progress_bar: Optional[ProgressBar] = None,
|
|
2130
2129
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2131
2130
|
add_fit_system_record_id: bool = False,
|
|
2131
|
+
target_name: Optional[str] = None,
|
|
2132
2132
|
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2133
2133
|
if self._search_task is None:
|
|
2134
2134
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -2313,8 +2313,11 @@ if response.status_code == 200:
|
|
|
2313
2313
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2314
2314
|
]
|
|
2315
2315
|
|
|
2316
|
-
if add_fit_system_record_id:
|
|
2317
|
-
|
|
2316
|
+
if add_fit_system_record_id and target_name is not None:
|
|
2317
|
+
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
2318
|
+
df = self.__add_fit_system_record_id(
|
|
2319
|
+
df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
|
|
2320
|
+
)
|
|
2318
2321
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2319
2322
|
features_not_to_pass.append(SORT_ID)
|
|
2320
2323
|
|
|
@@ -2624,18 +2627,17 @@ if response.status_code == 200:
|
|
|
2624
2627
|
checked_generate_features = []
|
|
2625
2628
|
for gen_feature in self.generate_features:
|
|
2626
2629
|
if gen_feature not in x_columns:
|
|
2627
|
-
|
|
2628
|
-
|
|
2630
|
+
if gen_feature == self._get_phone_column(self.search_keys):
|
|
2631
|
+
raise ValidationError(
|
|
2632
|
+
self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2633
|
+
)
|
|
2634
|
+
else:
|
|
2635
|
+
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2629
2636
|
else:
|
|
2630
2637
|
checked_generate_features.append(gen_feature)
|
|
2631
2638
|
self.generate_features = checked_generate_features
|
|
2632
2639
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2633
2640
|
|
|
2634
|
-
if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
|
|
2635
|
-
for column in self.columns_for_online_api:
|
|
2636
|
-
if column not in validated_X.columns:
|
|
2637
|
-
raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
|
|
2638
|
-
|
|
2639
2641
|
if self.id_columns is not None:
|
|
2640
2642
|
for id_column in self.id_columns:
|
|
2641
2643
|
if id_column not in validated_X.columns:
|
|
@@ -2759,7 +2761,7 @@ if response.status_code == 200:
|
|
|
2759
2761
|
self.__log_warning(full_duplicates_warning)
|
|
2760
2762
|
|
|
2761
2763
|
# Explode multiple search keys
|
|
2762
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2764
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET)
|
|
2763
2765
|
|
|
2764
2766
|
# TODO check that this is correct for enrichment
|
|
2765
2767
|
self.df_with_original_index = df.copy()
|
|
@@ -2841,7 +2843,7 @@ if response.status_code == 200:
|
|
|
2841
2843
|
if eval_set is not None and len(eval_set) > 0:
|
|
2842
2844
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2843
2845
|
|
|
2844
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2846
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET)
|
|
2845
2847
|
|
|
2846
2848
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2847
2849
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -3533,53 +3535,60 @@ if response.status_code == 200:
|
|
|
3533
3535
|
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3534
3536
|
search_keys: Dict[str, SearchKey],
|
|
3535
3537
|
id_name: str,
|
|
3538
|
+
target_name: str,
|
|
3536
3539
|
) -> pd.DataFrame:
|
|
3537
|
-
# save original order or rows
|
|
3538
3540
|
original_index_name = df.index.name
|
|
3539
3541
|
index_name = df.index.name or DEFAULT_INDEX
|
|
3540
3542
|
original_order_name = "original_order"
|
|
3543
|
+
# Save original index
|
|
3541
3544
|
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
|
|
3545
|
+
# Save original order
|
|
3542
3546
|
df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
|
|
3543
3547
|
|
|
3544
|
-
# order by date and idempotent order by other keys
|
|
3545
|
-
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3546
|
-
sort_exclude_columns = [
|
|
3547
|
-
original_order_name,
|
|
3548
|
-
ORIGINAL_INDEX,
|
|
3549
|
-
EVAL_SET_INDEX,
|
|
3550
|
-
TARGET,
|
|
3551
|
-
"__target",
|
|
3552
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3553
|
-
]
|
|
3554
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3555
|
-
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3556
|
-
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3557
|
-
else:
|
|
3558
|
-
date_column = self._get_date_column(search_keys)
|
|
3559
|
-
sort_columns = [date_column] if date_column is not None else []
|
|
3548
|
+
# order by date and idempotent order by other keys and features
|
|
3560
3549
|
|
|
3561
|
-
|
|
3562
|
-
|
|
3550
|
+
sort_exclude_columns = [
|
|
3551
|
+
original_order_name,
|
|
3552
|
+
ORIGINAL_INDEX,
|
|
3553
|
+
EVAL_SET_INDEX,
|
|
3554
|
+
TARGET,
|
|
3555
|
+
"__target",
|
|
3556
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3557
|
+
]
|
|
3558
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3559
|
+
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3560
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3561
|
+
else:
|
|
3562
|
+
date_column = self._get_date_column(search_keys)
|
|
3563
|
+
sort_exclude_columns.append(date_column)
|
|
3564
|
+
columns_to_sort = [date_column] if date_column is not None else []
|
|
3563
3565
|
|
|
3564
|
-
|
|
3565
|
-
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
and c not in sort_exclude_columns
|
|
3571
|
-
and df[c].nunique() > 1
|
|
3572
|
-
]
|
|
3573
|
-
)
|
|
3566
|
+
do_sorting = True
|
|
3567
|
+
if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
|
|
3568
|
+
# Check duplicates by date and id_columns
|
|
3569
|
+
duplicate_check_columns = [c for c in self.id_columns if c in df.columns]
|
|
3570
|
+
if date_column is not None:
|
|
3571
|
+
duplicate_check_columns.append(date_column)
|
|
3574
3572
|
|
|
3575
|
-
|
|
3573
|
+
duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
|
|
3574
|
+
if duplicates.any():
|
|
3575
|
+
self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
|
3576
|
+
do_sorting = False
|
|
3577
|
+
else:
|
|
3578
|
+
columns_to_hash = list(search_keys.keys()) + self.id_columns
|
|
3579
|
+
columns_to_hash = sort_columns(
|
|
3580
|
+
df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
|
|
3581
|
+
)
|
|
3582
|
+
else:
|
|
3583
|
+
columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
|
|
3576
3584
|
|
|
3585
|
+
if do_sorting:
|
|
3577
3586
|
search_keys_hash = "search_keys_hash"
|
|
3578
|
-
if len(
|
|
3579
|
-
|
|
3580
|
-
|
|
3587
|
+
if len(columns_to_hash) > 0:
|
|
3588
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[columns_to_hash], index=False)
|
|
3589
|
+
columns_to_sort.append(search_keys_hash)
|
|
3581
3590
|
|
|
3582
|
-
df = df.sort_values(by=
|
|
3591
|
+
df = df.sort_values(by=columns_to_sort)
|
|
3583
3592
|
|
|
3584
3593
|
if search_keys_hash in df.columns:
|
|
3585
3594
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
|
35
35
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
36
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
37
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
+
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
|
38
39
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
39
40
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
40
41
|
# Errors
|
|
@@ -111,7 +112,6 @@ x_is_empty=X is empty
|
|
|
111
112
|
y_is_empty=y is empty
|
|
112
113
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
114
|
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
-
missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
|
|
115
115
|
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
116
|
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
117
|
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
upgini/utils/email_utils.py
CHANGED
|
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
|
|
|
116
116
|
else:
|
|
117
117
|
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
118
118
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
119
|
+
del self.search_keys[self.email_column]
|
|
120
|
+
if self.email_column in self.unnest_search_keys:
|
|
121
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
122
122
|
|
|
123
123
|
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
124
124
|
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
125
125
|
self.columns_renaming[one_domain_name] = original_email_column
|
|
126
126
|
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
127
127
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
if self.email_converted_to_hem:
|
|
129
|
+
df = df.drop(columns=self.email_column)
|
|
130
|
+
del self.columns_renaming[self.email_column]
|
|
131
131
|
|
|
132
132
|
return df
|
upgini/utils/mstats.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import numpy.ma as ma
|
|
6
|
+
import scipy
|
|
7
|
+
from joblib import Parallel, delayed
|
|
8
|
+
from numpy import ndarray
|
|
9
|
+
from psutil import cpu_count
|
|
10
|
+
|
|
11
|
+
np.seterr(divide="ignore")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
warnings.simplefilter(action="ignore", category=RuntimeWarning)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _find_repeats(arr):
|
|
18
|
+
# This function assumes it may clobber its input.
|
|
19
|
+
if len(arr) == 0:
|
|
20
|
+
return np.array(0, np.float64), np.array(0, np.intp)
|
|
21
|
+
|
|
22
|
+
# XXX This cast was previously needed for the Fortran implementation,
|
|
23
|
+
# should we ditch it?
|
|
24
|
+
arr = np.asarray(arr, np.float64).ravel()
|
|
25
|
+
arr.sort()
|
|
26
|
+
|
|
27
|
+
# Taken from NumPy 1.9's np.unique.
|
|
28
|
+
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
29
|
+
unique = arr[change]
|
|
30
|
+
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
31
|
+
freq = np.diff(change_idx)
|
|
32
|
+
atleast2 = freq > 1
|
|
33
|
+
return unique[atleast2], freq[atleast2]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def find_repeats(arr):
    """Find repeated values among the unmasked entries of ``arr``.

    The compressed data is handed to ``_find_repeats``, which sorts in place,
    so a private copy is made first whenever the compressed array may share
    memory with ``arr``.
    """
    # ma.compressed promises a "new array", but can actually return a reference.
    compressed = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        shares_memory = np.may_share_memory(compressed, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        shares_memory = False
    return _find_repeats(compressed.copy() if shares_memory else compressed)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def rankdata(data, axis=None, use_missing=False):
    """Rank ``data`` along ``axis``, giving tied values the mean of their ranks.

    The first ``count()`` positions of the argsort order receive ranks
    ``1..n``; the remaining positions receive ``(n + 1) / 2`` when
    ``use_missing`` is true and ``0`` otherwise. With ``axis=None`` the data
    is ranked as a flattened array and reshaped back to its original shape.
    """

    def _rank1d(data, use_missing=False):
        valid = data.count()
        order = data.argsort()
        ranks = np.empty(data.size, dtype=float)
        ranks[order[:valid]] = np.arange(1, valid + 1)
        # Positions past the counted entries get a neutral or zero rank.
        ranks[order[valid:]] = (valid + 1) / 2.0 if use_missing else 0

        tied_values = find_repeats(data.copy())[0]
        for value in tied_values:
            ties = (data == value).filled(False)
            ranks[ties] = ranks[ties].mean()
        return ranks

    data = ma.array(data, copy=False)
    if axis is not None:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
    if data.ndim > 1:
        return _rank1d(data.ravel(), use_missing).reshape(data.shape)
    return _rank1d(data, use_missing)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _chk_asarray(a, axis):
|
|
80
|
+
# Always returns a masked array, raveled for axis=None
|
|
81
|
+
a = ma.asanyarray(a)
|
|
82
|
+
if axis is None:
|
|
83
|
+
a = ma.ravel(a)
|
|
84
|
+
outaxis = 0
|
|
85
|
+
else:
|
|
86
|
+
outaxis = axis
|
|
87
|
+
return a, outaxis
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Taken from scipy.mstats with following tweaks:
|
|
94
|
+
# 1. parallel pairwise computation
|
|
95
|
+
# 2. custom masking
|
|
96
|
+
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Compute Spearman rank correlations and p-values on masked data.

    Args:
        x: Input data; converted to a masked array (raveled when ``axis`` is None).
        y: Optional second input, stacked onto ``x`` as extra variable(s).
        use_ties: Must be true; ``False`` raises ``ValueError``.
        axis: Axis handed to ``_chk_asarray``; with ``1`` the stacked data is
            transposed so that rows are observations and columns variables.
        nan_policy: When ``"omit"``, ``mask_fn`` is applied to the data.
            Other values are not inspected by this function.
        alternative: Passed through to SciPy's ``_ttest_finish``.
        mask_fn: Callable used to mask values when ``nan_policy == "omit"``.

    Returns:
        ``SpearmanrResult(correlation, pvalue)``: whatever the pair helper
        returns when there are exactly two variables, otherwise symmetric
        ``(n_vars, n_vars)`` matrices filled pair by pair.

    Raises:
        ValueError: If ``use_ties`` is false.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        # NOTE(review): `np.any(x.data)` is false whenever no remaining value
        # is non-zero, so all-zero (or empty) data also yields NaN - confirm
        # this is intended.
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        # Degrees of freedom: observations minus 2, minus masked entries
        # counted in the first column.
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        # NOTE(review): relies on a private SciPy helper; it may move or
        # change between SciPy releases.
        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        max_cpu_cores = cpu_count(logical=False)
        with np.errstate(divide="ignore"):
            # One job per unordered pair (var1 < var2), in row-major pair order.
            results = Parallel(n_jobs=max_cpu_cores)(
                delayed(_spearmanr_2cols)(x[:, [var1, var2]])
                for var1 in range(n_vars - 1)
                for var2 in range(var1 + 1, n_vars)
            )

        # Diagonal defaults: correlation 1 and p-value 0 for a variable with itself.
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        # Consume results in the same pair order in which the jobs were submitted.
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = results.pop(0)
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        return SpearmanrResult(rs, prob)
|
upgini/utils/sort.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from joblib import Parallel, delayed
|
|
7
|
+
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
|
|
8
|
+
from psutil import cpu_count
|
|
9
|
+
from scipy.stats import skew, spearmanr
|
|
10
|
+
|
|
11
|
+
from upgini.metadata import ModelTaskType, SearchKey
|
|
12
|
+
from upgini.utils import mstats
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def sort_columns(
    df: pd.DataFrame,
    target_column: str,
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: List[str],
) -> List[str]:
    """Build an ordered list of column names to sort by.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey`` value. They are followed by the remaining non-excluded
    columns with more than one distinct value that ``get_sort_columns_dict``
    scored, ordered by that score in descending order.
    """
    df = df.copy()  # avoid side effects

    key_columns = [
        key
        for key in sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
        if key not in exclude_columns
    ]

    candidate_features = []
    for column in df.columns:
        if column in key_columns or column in exclude_columns:
            continue
        if df[column].nunique() > 1:
            candidate_features.append(column)
    candidate_features.sort()

    prepared_target = prepare_target(df[target_column], model_task_type)
    sort_dict = get_sort_columns_dict(
        df[key_columns + candidate_features], prepared_target, key_columns, omit_nan=True
    )

    scored_features = [c for c in candidate_features if c in sort_dict]
    scored_features.sort(key=lambda e: sort_dict[e], reverse=True)
    return key_columns + scored_features
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_sort_columns_dict(
|
|
44
|
+
df: pd.DataFrame,
|
|
45
|
+
target: pd.Series,
|
|
46
|
+
sorted_keys: List[str],
|
|
47
|
+
omit_nan: bool,
|
|
48
|
+
n_jobs: int | None = None,
|
|
49
|
+
) -> dict[str, Any]:
|
|
50
|
+
string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
|
|
51
|
+
columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
|
|
52
|
+
if len(string_features) > 0:
|
|
53
|
+
if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
|
|
54
|
+
# factorize string features
|
|
55
|
+
for c in string_features:
|
|
56
|
+
df[c] = df[c].factorize(sort=True)[0]
|
|
57
|
+
columns_for_sort.extend(string_features)
|
|
58
|
+
|
|
59
|
+
if len(columns_for_sort) == 0:
|
|
60
|
+
return {}
|
|
61
|
+
|
|
62
|
+
df = df[columns_for_sort]
|
|
63
|
+
hashes = [hash_series(df[col]) for col in columns_for_sort]
|
|
64
|
+
df = np.asarray(df, dtype=np.float32)
|
|
65
|
+
correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
|
|
66
|
+
|
|
67
|
+
sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
|
|
68
|
+
return sort_dict
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
    """Return, per column, the larger of the correlations computed with the target.

    Delegates to ``get_target_correlations`` with ``precision=7`` and reduces
    its rows with a column-wise maximum.
    """
    per_method = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    strongest = np.max(per_method, axis=0)
    return strongest
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
):
    """Compute two rows of per-column correlations with ``target``.

    Row 0 holds the values from ``calculate_spearman_corr_with_target`` and
    row 1 the absolute values from ``np.corrcoef``. NaNs are replaced via
    ``np.nan_to_num`` and the result is truncated to ``precision`` decimals.
    """
    features = np.asarray(df, dtype=np.float32)

    spearman_row = np.nan_to_num(
        calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs), copy=False
    )
    # The last row of the joint correlation matrix relates the target to every feature.
    pearson_row = np.nan_to_num(np.abs(np.corrcoef(features.T, target.T, rowvar=True)[-1, :-1]))

    stacked = np.zeros((2, features.shape[1]))
    stacked[0, :] = spearman_row
    stacked[1, :] = pearson_row

    scale = 10**precision
    return np.trunc(stacked * scale) / scale
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
    """Keep only the first element (the correlation) of every sort-dict value."""
    correlations = {}
    for column, entry in sort_dict.items():
        correlations[column] = entry[0]
    return correlations
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def calculate_spearman_corr_with_target(
|
|
97
|
+
X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
|
|
98
|
+
) -> np.ndarray:
|
|
99
|
+
if isinstance(X, pd.DataFrame):
|
|
100
|
+
X = np.asarray(X, dtype=np.float32)
|
|
101
|
+
|
|
102
|
+
if X.size == 0:
|
|
103
|
+
return np.ndarray(shape=(0,))
|
|
104
|
+
|
|
105
|
+
all_correlations = np.zeros(X.shape[1])
|
|
106
|
+
all_correlations.fill(np.nan)
|
|
107
|
+
cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
|
|
108
|
+
|
|
109
|
+
if omit_nan:
|
|
110
|
+
results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
|
|
111
|
+
delayed(mstats.spearmanr)(
|
|
112
|
+
X[:, i],
|
|
113
|
+
y,
|
|
114
|
+
nan_policy="omit",
|
|
115
|
+
axis=0,
|
|
116
|
+
)
|
|
117
|
+
for i in cols2calc
|
|
118
|
+
)
|
|
119
|
+
target_correlations = np.array([abs(res.correlation) for res in results])
|
|
120
|
+
else:
|
|
121
|
+
cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
|
|
122
|
+
target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
|
|
123
|
+
if isinstance(target_correlations, float):
|
|
124
|
+
target_correlations = np.abs([target_correlations])
|
|
125
|
+
else:
|
|
126
|
+
target_correlations = np.abs(target_correlations)[-1, :-1]
|
|
127
|
+
|
|
128
|
+
all_correlations[cols2calc] = target_correlations
|
|
129
|
+
|
|
130
|
+
return all_correlations
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
|
|
134
|
+
features_num = X.shape[1]
|
|
135
|
+
if y is not None:
|
|
136
|
+
features_num += 1
|
|
137
|
+
|
|
138
|
+
if features_num < 2:
|
|
139
|
+
return 1.0
|
|
140
|
+
else:
|
|
141
|
+
return spearmanr(X, y, nan_policy=nan_policy).correlation
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def hash_series(series: pd.Series) -> int:
    """Return an integer SHA-256 fingerprint of ``series``.

    The per-row hashes from ``pd.util.hash_pandas_object`` (index included)
    are fed to SHA-256 and the digest is interpreted as one unsigned integer.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    # Big-endian interpretation of the raw digest equals parsing the hex digest in base 16.
    return int.from_bytes(digest, byteorder="big")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Encode or transform ``target`` depending on the task type and dtype.

    * Regression with a numeric or datetime target: ``np.log1p`` is applied
      when the minimum is non-negative and the absolute skewness, rounded to
      two decimals, is at least 0.9; otherwise the values are left unchanged.
    * Every other case: values are cast to ``str``, then to ``category``, and
      replaced by their integer category codes.

    Returns:
        A Series built from the resulting values, carrying the original name.
    """
    original_name = target.name
    is_regression = model_task_type == ModelTaskType.REGRESSION

    if is_regression and (is_numeric_dtype(target) or is_datetime64_any_dtype(target)):
        rounded_skew = round(abs(skew(target)), 2)
        is_non_negative = target.min() >= 0
        if is_non_negative and rounded_skew >= 0.9:
            target = np.log1p(target)
    else:
        # Non-regression task or a target that is neither numeric nor datetime.
        as_category = target.astype(str).astype("category")
        target = as_category.cat.codes

    return pd.Series(target, name=original_name)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=LqyhrBVdkMR0Yf_KH4QG5lOkJF9WVTAZe6Z3z7S3fPg,25
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=O2VkqPFqyjgjpd27s6JU7pRcEsQM0L14L020aWJPvNs,202385
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -16,21 +16,21 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=d-sijAD7dETfqIOCaZh1vhuVjsS_nqa-6dhjwkCdny4,10441
|
|
20
|
+
upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
|
|
21
21
|
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
22
|
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
24
|
upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=D2wXkNMehx85klDfIiqVYCiIHz7SsbgmR8R54oIXy78,27428
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -46,20 +46,22 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
|
46
46
|
upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
54
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
55
|
+
upgini/utils/mstats.py,sha256=GjBAUacgfAoVQVFUrMiRYdVkmx93CIThLRNvYLLiV48,5765
|
|
55
56
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
56
57
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
58
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
58
59
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
60
|
+
upgini/utils/sort.py,sha256=ACcYrRNu1MPoYyt1oEYXvdL_dXx9YtJoSLjeiBu46KU,5648
|
|
59
61
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
62
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
63
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
64
|
+
upgini-1.2.58a1.dist-info/METADATA,sha256=1L4P8tuUJWf6nuGA0YfaZ5PUx2YVvqrTODLrPZRHyc0,49057
|
|
65
|
+
upgini-1.2.58a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
66
|
+
upgini-1.2.58a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
67
|
+
upgini-1.2.58a1.dist-info/RECORD,,
|
|
File without changes
|